Coverage for python/lsst/daf/butler/script/ingest_files.py: 23%

54 statements  

coverage.py v7.2.3, created at 2023-04-19 03:42 -0700

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from ..core import DatasetRef, FileDataset
from ..registry import DatasetIdGenEnum

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
) -> None:
    """Ingest files from a table.

    Parameters
    ----------
    repo : `str`
        URI string of the Butler repo to use.
    dataset_type : `str`
        The name of the dataset type for the files to be ingested. This
        dataset type must already exist.
    run : `str`
        The run in which the files should be ingested.
    table_file : `str`
        Path to a table file to read. This file can be in any format that
        Astropy can read, so long as Astropy can determine the format
        itself.
    data_id : `tuple` of `str`
        Tuple of strings of the form ``keyword=value`` that can be used
        to define dataId elements that are fixed for all ingested files
        found in the table file. This allows those columns to be missing
        from the table file. Dimensions given here override table columns.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Should map to `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The
        default is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
    """
    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID generation mode string to the enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.registry.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    uri = ResourcePath(table_file, forceAbsolute=False)
    with uri.as_local() as local_file:
        table = Table.read(local_file.ospath)

    datasets = extract_datasets_from_table(table, common_data_id, datasetType, formatter, prefix)

    butler.ingest(*datasets, transfer=transfer, run=run, idGenerationMode=id_gen_mode)
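
# Example usage (a sketch, not part of the original module; the repo URI,
# run name, and table file below are hypothetical). The table's first
# column holds the file paths and the remaining columns are dimensions;
# here the instrument dimension is supplied via ``data_id`` instead of as
# a table column:
#
#     ingest_files(
#         repo="/data/butler_repo",
#         dataset_type="raw",
#         run="ingest/20230419",
#         table_file="files_to_ingest.ecsv",
#         data_id=("instrument=HSC",),
#         transfer="copy",
#     )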

def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    formatter: str | None = None,
    prefix: str | None = None,
) -> list[FileDataset]:
    """Extract datasets from the supplied table.

    Parameters
    ----------
    table : `astropy.table.Table`
        Table containing the datasets. The first column is assumed to be
        the file URI and the remaining columns are dimensions.
    common_data_id : `dict`
        Data ID values that are common to every row in the table. These
        take priority if a dimension in this dataId is also present as
        a column in the table.
    datasetType : `DatasetType`
        The dataset type to be associated with the ingested data.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    prefix : `str`, optional
        Prefix to be used for relative paths. Can be `None` for current
        working directory.

    Returns
    -------
    datasets : `list` of `FileDataset`
        The `FileDataset` objects corresponding to the rows in the table.
        The number of elements in this list can be smaller than the number
        of rows in the file because one file can appear in multiple rows
        with different dataIds.
    """
    # The file is the first column and everything else is assumed to
    # be dimensions, so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId)  # type: ignore

        # Convert the path to absolute (because otherwise the system will
        # assume it is relative to the datastore root, and that is almost
        # certainly never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets
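
# Example (a sketch with hypothetical values; ``datasetType`` is assumed
# to be an existing DatasetType whose dimensions include instrument and
# exposure). A table in which one file appears twice with different
# exposures yields a single FileDataset carrying two DatasetRefs:
#
#     table = Table({"file": ["a.fits", "a.fits"], "exposure": [1, 2]})
#     datasets = extract_datasets_from_table(
#         table, {"instrument": "HSC"}, datasetType
#     )
#     # len(datasets) == 1 and len(datasets[0].refs) == 2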


def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
    # Convert any additional k=v strings in the dataId tuple to dict
    # form.
    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id
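
# Example (hypothetical dimension values; assumes the universe defines an
# "instrument" dimension with a string primary key and an "exposure"
# dimension with an integer primary key, so the exposure value is cast
# from str to int):
#
#     parse_data_id_tuple(("instrument=HSC", "exposure=903334"), universe)
#     # -> {"instrument": "HSC", "exposure": 903334}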