Coverage for python/lsst/daf/butler/script/ingest_files.py: 22% (53 statements)
coverage.py v7.2.7, created at 2023-07-21 09:55 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from ..core import DatasetIdGenEnum, DatasetRef, FileDataset

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
) -> None:
    """Ingest files from a table.

    Parameters
    ----------
    repo : `str`
        URI string of the Butler repo to use.
    dataset_type : `str`
        The name of the dataset type for the files to be ingested. This
        dataset type must exist.
    run : `str`
        The run in which the files should be ingested.
    table_file : `str`
        Path to a table file to read. This file can be in any format that
        can be read by Astropy so long as Astropy can determine the format
        itself.
    data_id : `tuple` of `str`
        Tuple of strings of the form ``keyword=value`` that can be used
        to define dataId elements that are fixed for all ingested files
        found in the table file. This allows those columns to be missing
        from the table file. Dimensions given here override table columns.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Must be the name of a member of
        `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The
        default is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
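
    Examples
    --------
    A minimal sketch, not taken from the package documentation; the
    repository path, dataset type, run name, table file, and instrument
    value below are hypothetical and must be valid for your repository:

    >>> ingest_files(  # doctest: +SKIP
    ...     repo="/path/to/repo",
    ...     dataset_type="raw",
    ...     run="ingest/files",
    ...     table_file="files.ecsv",
    ...     data_id=("instrument=SomeCam",),
    ...     transfer="copy",
    ... )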

    """
    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]
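    # For example, id_generation_mode="UNIQUE" selects DatasetIdGenEnum.UNIQUE.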

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    uri = ResourcePath(table_file, forceAbsolute=False)
    with uri.as_local() as local_file:
        table = Table.read(local_file.ospath)

    datasets = extract_datasets_from_table(
        table, common_data_id, datasetType, run, formatter, prefix, id_gen_mode
    )

    butler.ingest(*datasets, transfer=transfer)


def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    run: str,
    formatter: str | None = None,
    prefix: str | None = None,
    id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
) -> list[FileDataset]:
    """Extract datasets from the supplied table.

    Parameters
    ----------
    table : `astropy.table.Table`
        Table containing the datasets. The first column is assumed to be
        the file URI and the remaining columns are dimensions.
    common_data_id : `dict`
        Data ID values that are common to every row in the table. These
        take priority if a dimension in this dataId is also present as
        a column in the table.
    datasetType : `DatasetType`
        The dataset type to be associated with the ingested data.
    run : `str`
        The name of the run that will be receiving these datasets.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    prefix : `str`, optional
        Prefix to be used for relative paths. Can be `None` for current
        working directory.
    id_generation_mode : `DatasetIdGenEnum`, optional
        The mode to use when creating the dataset IDs.

    Returns
    -------
    datasets : `list` of `FileDataset`
        The `FileDataset` objects corresponding to the rows in the table.
        The number of elements in this list can be smaller than the number
        of rows in the file because one file can appear in multiple rows
        with different dataIds.
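
    Examples
    --------
    A sketch with hypothetical column names and values; ``datasetType``
    must use the dimensions named by the table columns and the common
    data ID:

    >>> table = Table({"file": ["a.fits", "b.fits"], "visit": [1, 2]})  # doctest: +SKIP
    >>> datasets = extract_datasets_from_table(  # doctest: +SKIP
    ...     table, {"instrument": "SomeCam"}, datasetType, run="ingest/files"
    ... )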

    """
    # The file is the first column and everything else is assumed to
    # be dimensions, so we need to know the name of that column.
    file_column = table.colnames[0]
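    # For example, with columns ("file", "visit") this is "file".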

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId, run=run, id_generation_mode=id_generation_mode)  # type: ignore

        # Convert the path to absolute (because otherwise the system will
        # assume it is relative to the datastore root, and that is almost
        # certainly never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets


def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
    """Convert any additional k=v strings in the dataId tuple to dict
    form.
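
    Examples
    --------
    A sketch, assuming ``universe`` defines an integer-keyed ``visit``
    dimension (the values are hypothetical):

    >>> parse_data_id_tuple(("visit=42",), universe)  # doctest: +SKIP
    {'visit': 42}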

    """
    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id
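
# Example input table for ingest_files (a hypothetical ECSV fragment; the
# first column is the file path and the remaining columns are dimension
# values):
#
#     # %ECSV 1.0
#     # ---
#     # datatype:
#     # - {name: file, datatype: string}
#     # - {name: visit, datatype: int64}
#     # schema: astropy-2.0
#     file visit
#     data/a.fits 1
#     data/b.fits 2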