Coverage for python/lsst/daf/butler/script/ingest_files.py: 25%


# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

from astropy.table import Table
from lsst.utils import doImport

from .._butler import Butler
from ..core import ButlerURI, DatasetRef, FileDataset
from ..registry import DatasetIdGenEnum

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)

def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: Tuple[str, ...] = (),
    formatter: Optional[str] = None,
    id_generation_mode: str = "UNIQUE",
    prefix: Optional[str] = None,
    transfer: str = "auto",
) -> None:
    """Ingest files from a table.

    Parameters
    ----------
    repo : `str`
        URI string of the Butler repo to use.
    dataset_type : `str`
        The name of the dataset type for the files to be ingested. This
        dataset type must already exist.
    run : `str`
        The run in which the files should be ingested.
    table_file : `str`
        Path to a table file to read. This file can be in any format that
        Astropy can read, so long as Astropy can determine the format
        itself.
    data_id : `tuple` of `str`
        Tuple of strings of the form ``keyword=value`` that can be used
        to define dataId elements that are fixed for all ingested files
        found in the table file. This allows those columns to be missing
        from the table file. Dimensions given here override table columns.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Should map to a member of
        `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The
        default is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
    """

    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]
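    # For example, "UNIQUE" selects DatasetIdGenEnum.UNIQUE (the default in
    # the signature above); an unrecognized mode name raises KeyError here.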

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.registry.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    table = Table.read(table_file)

    datasets = extract_datasets_from_table(table, common_data_id, datasetType, formatter, prefix)

    butler.ingest(*datasets, transfer=transfer, run=run, idGenerationMode=id_gen_mode)
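
# A minimal usage sketch. The repo path, run name, dataset type name, and
# table file below are hypothetical, not taken from this module:
#
#     ingest_files(
#         repo="/path/to/butler/repo",
#         dataset_type="raw",
#         run="ingest/run1",
#         table_file="files.ecsv",
#         data_id=("instrument=HSC",),
#         transfer="copy",
#     )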


def extract_datasets_from_table(
    table: Table,
    common_data_id: Dict[str, Any],
    datasetType: DatasetType,
    formatter: Optional[str] = None,
    prefix: Optional[str] = None,
) -> List[FileDataset]:

124 """Extract datasets from the supplied table. 

125 

126 Parameters 

127 ---------- 

128 table : `astropy.table.Table` 

129 Table containing the datasets. The first column is assumed to be 

130 the file URI and the remaining columns are dimensions. 

131 common_data_id : `dict` 

132 Data ID values that are common to every row in the table. These 

133 take priority if a dimension in this dataId is also present as 

134 a column in the table. 

135 datasetType : `DatasetType` 

136 The dataset type to be associated with the ingested data. 

137 formatter : `str`, optional 

138 Fully-qualified python class name for the `Formatter` to use 

139 to read the ingested files. If `None` the formatter is read from 

140 datastore configuration based on the dataset type. 

141 prefix : `str` 

142 Prefix to be used for relative paths. Can be `None` for current 

143 working directory. 

144 

145 Returns 

146 ------- 

147 datasets : `list` of `FileDataset` 

148 The `FileDataset` objects corresponding to the rows in the table. 

149 The number of elements in this list can be smaller than the number 

150 of rows in the file because one file can appear in multiple rows 

151 with different dataIds. 

152 """ 

    # The file is the first column and everything else is assumed to
    # be dimensions so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:

        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId)  # type: ignore

        # Convert path to absolute (because otherwise system will
        # assume relative to datastore root and that is almost certainly
        # never the right default here).
        path_uri = ButlerURI(path, root=prefix, forceAbsolute=True)
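        # For illustration (hypothetical layout): a relative path such as
        # "files/a.fits" with prefix "/data/staging" resolves to
        # "/data/staging/files/a.fits"; absolute paths are left unchanged.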

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets
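
# For illustration (hypothetical table contents): given rows
#
#     files         exposure
#     a.fits        101
#     a.fits        102
#     b.fits        103
#
# where "exposure" stands in for whatever dimension columns the table has,
# extract_datasets_from_table would return two FileDataset objects: one for
# a.fits carrying two DatasetRefs and one for b.fits carrying a single one.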


def parse_data_id_tuple(data_ids: Tuple[str, ...], universe: DimensionUniverse) -> Dict[str, Any]:
    # Convert any additional k=v strings in the dataId tuple to dict
    # form.
    data_id: Dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id
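
# A hedged example of the expected behaviour, assuming "visit" and
# "instrument" are dimensions known to the universe:
#
#     parse_data_id_tuple(("visit=42", "instrument=HSC"), universe)
#
# would return {"visit": 42, "instrument": "HSC"}, with "42" cast to int
# via the visit dimension's primary-key python type.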