# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

from astropy.table import Table

from lsst.utils import doImport

from .. import Butler, DatasetIdGenEnum
from ..core import ButlerURI, DatasetRef, FileDataset

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(repo: str, dataset_type: str, run: str, table_file: str,
                 data_id: Tuple[str, ...] = (),
                 formatter: Optional[str] = None,
                 id_generation_mode: str = "UNIQUE",
                 prefix: Optional[str] = None,
                 transfer: str = "auto") -> None:
48 """Ingest files from a table. 

49 

50 Parameters 

51 ---------- 

52 repo : `str` 

53 URI string of the Butler repo to use. 

54 dataset_type : `str` 

55 The name of the dataset type for the files to be ingested. This 

56 dataset type must exist. 

57 run : `str` 

58 The run in which the files should be ingested. 

59 table_file : `str` 

60 Path to a table file to read. This file can be in any format that 

61 can be read by Astropy so long as Astropy can determine the format 

62 itself. 

63 data_id : `tuple` of `str` 

64 Tuple of strings of the form ``keyword=value`` that can be used 

65 to define dataId elements that are fixed for all ingested files 

66 found in the table file. This allows those columns to be missing 

67 from the table file. Dimensions given here override table columns. 

68 formatter : `str`, optional 

69 Fully-qualified python class name for the `Formatter` to use 

70 to read the ingested files. If `None` the formatter is read from 

71 datastore configuration based on the dataset type. 

72 id_generation_mode : `str`, optional 

73 Mode to use for generating IDs. Should map to `DatasetGenIdEnum`. 

74 prefix : `str`, optional 

75 Prefix to use when resolving relative paths in table files. The default 

76 is to use the current working directory. 

77 transfer : `str`, optional 

78 Transfer mode to use for ingest. 
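
    Examples
    --------
    A sketch of a typical invocation; the repo path, dataset type, run
    name, table file, and instrument value here are illustrative
    placeholders::

        ingest_files("/some/repo", "raw", "ingest/run", "files.ecsv",
                     data_id=("instrument=SomeCam",),
                     transfer="copy")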

79 """ 

80 

81 # Check that the formatter can be imported -- validate this as soon 

82 # as possible before we read a potentially large table file. 

83 if formatter: 

84 doImport(formatter) 

85 else: 

86 formatter = None 

87 

88 # Force empty string prefix (from click) to None for API compatibility. 

89 if not prefix: 

90 prefix = None 

91 

92 # Convert the dataset ID gen mode string to enum. 

93 id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode] 

94 

95 # Create the butler with the relevant run attached. 

96 butler = Butler(repo, run=run) 

97 

98 datasetType = butler.registry.getDatasetType(dataset_type) 

99 

100 # Convert the k=v strings into a dataId dict. 

101 universe = butler.registry.dimensions 

102 common_data_id = parse_data_id_tuple(data_id, universe) 

103 

104 # Read the table assuming that Astropy can work out the format. 

105 table = Table.read(table_file) 

106 

107 datasets = extract_datasets_from_table(table, common_data_id, datasetType, formatter, prefix) 

108 

109 butler.ingest(*datasets, transfer=transfer, run=run, idGenerationMode=id_gen_mode) 


def extract_datasets_from_table(table: Table, common_data_id: Dict, datasetType: DatasetType,
                                formatter: Optional[str] = None,
                                prefix: Optional[str] = None) -> List[FileDataset]:
115 """Extract datasets from the supplied table. 

116 

117 Parameters 

118 ---------- 

119 table : `astropy.table.Table` 

120 Table containing the datasets. The first column is assumed to be 

121 the file URI and the remaining columns are dimensions. 

122 common_data_id : `dict` 

123 Data ID values that are common to every row in the table. These 

124 take priority if a dimension in this dataId is also present as 

125 a column in the table. 

126 datasetType : `DatasetType` 

127 The dataset type to be associated with the ingested data. 

128 formatter : `str`, optional 

129 Fully-qualified python class name for the `Formatter` to use 

130 to read the ingested files. If `None` the formatter is read from 

131 datastore configuration based on the dataset type. 

132 prefix : `str` 

133 Prefix to be used for relative paths. Can be `None` for current 

134 working directory. 

135 

136 Returns 

137 ------- 

138 datasets : `list` of `FileDataset` 

139 The `FileDataset` objects corresponding to the rows in the table. 

140 The number of elements in this list can be smaller than the number 

141 of rows in the file because one file can appear in multiple rows 

142 with different dataIds. 
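
    Examples
    --------
    A minimal sketch, assuming a table whose first column holds the file
    paths and whose second column is an ``exposure`` dimension; the
    column names and values here are illustrative::

        table = Table({"file": ["a.fits", "b.fits", "b.fits"],
                       "exposure": [1, 2, 3]})
        datasets = extract_datasets_from_table(table, {}, datasetType)

    Three rows yield two `FileDataset` entries here because ``b.fits``
    appears twice with different dataIds.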

143 """ 

144 # The file is the first column and everything else is assumed to 

145 # be dimensions so we need to know the name of that column. 

146 file_column = table.colnames[0] 

147 

148 # Handle multiple dataIds per file by grouping by file. 

149 refs_by_file = defaultdict(list) 

150 n_dataset_refs = 0 

151 for row in table: 

152 

153 # Convert the row to a dataId, remembering to extract the 

154 # path column. 

155 dataId = dict(row) 

156 path = dataId.pop(file_column) 

157 

158 # The command line can override a column. 

159 dataId.update(common_data_id) 

160 

161 # Create the dataset ref that is to be ingested. 

162 ref = DatasetRef(datasetType, dataId) # type: ignore 

163 

164 # Convert path to absolute (because otherwise system will 

165 # assume relative to datastore root and that is almost certainly 

166 # never the right default here). 

167 path_uri = ButlerURI(path, root=prefix, forceAbsolute=True) 

168 

169 refs_by_file[path_uri].append(ref) 

170 n_dataset_refs += 1 

171 

172 datasets = [FileDataset(path=file_uri, 

173 refs=refs, 

174 formatter=formatter,) for file_uri, refs in refs_by_file.items()] 

175 

176 log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets)) 

177 

178 return datasets 


def parse_data_id_tuple(data_ids: Tuple[str, ...], universe: DimensionUniverse) -> Dict[str, Any]:
    """Convert any additional ``k=v`` strings in the dataId tuple to
    `dict` form, with each value coerced to the Python type of the
    matching dimension's primary key.
    """
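
    # A sketch of the intended behaviour; "exposure" is an illustrative
    # dimension name, assumed to be defined in the given universe:
    #
    #     parse_data_id_tuple(("exposure=42",), universe)
    #     -> {"exposure": 42}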

    data_id: Dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=", 1)

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right Python type (since the values will
        # be strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id