Coverage for python/lsst/daf/butler/script/ingest_files.py: 22%

53 statements  

coverage.py v7.3.1, created at 2023-10-02 08:00 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from ..core import DatasetIdGenEnum, DatasetRef, FileDataset

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
) -> None:
    """Ingest files from a table.

    Parameters
    ----------
    repo : `str`
        URI string of the Butler repo to use.
    dataset_type : `str`
        The name of the dataset type for the files to be ingested. This
        dataset type must exist.
    run : `str`
        The run in which the files should be ingested.
    table_file : `str`
        Path to a table file to read. This file can be in any format that
        can be read by Astropy so long as Astropy can determine the format
        itself.
    data_id : `tuple` of `str`
        Tuple of strings of the form ``keyword=value`` that can be used
        to define dataId elements that are fixed for all ingested files
        found in the table file. This allows those columns to be missing
        from the table file. Dimensions given here override table columns.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Should map to `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The
        default is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
    """
    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    uri = ResourcePath(table_file, forceAbsolute=False)
    with uri.as_local() as local_file:
        table = Table.read(local_file.ospath)

    datasets = extract_datasets_from_table(
        table, common_data_id, datasetType, run, formatter, prefix, id_gen_mode
    )

    butler.ingest(*datasets, transfer=transfer)

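# The sketch below is illustrative only and is not part of the original module:
# it shows one hypothetical way to drive ingest_files() from Python for a CSV
# whose first column is the file path and whose remaining columns are
# dimensions. The repo URI, dataset type, run name, dimension names, and paths
# are all assumptions for the example.
def _example_ingest_call() -> None:  # pragma: no cover
    """Illustrative sketch: ingest a hypothetical ``files.csv`` with columns
    ``file,exposure,detector`` into an already-registered ``raw`` dataset type.
    """
    ingest_files(
        repo="/path/to/repo",            # hypothetical repo URI
        dataset_type="raw",              # dataset type must already exist
        run="ingest/example",            # run collection receiving the files
        table_file="files.csv",          # Astropy-readable table of files + dimensions
        data_id=("instrument=MyCam",),   # fixed for every row; overrides any column
        prefix="/data/staging",          # resolve relative paths in the table
        transfer="copy",                 # copy the files into the datastore
    )
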

def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    run: str,
    formatter: str | None = None,
    prefix: str | None = None,
    id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
) -> list[FileDataset]:
    """Extract datasets from the supplied table.

    Parameters
    ----------
    table : `astropy.table.Table`
        Table containing the datasets. The first column is assumed to be
        the file URI and the remaining columns are dimensions.
    common_data_id : `dict`
        Data ID values that are common to every row in the table. These
        take priority if a dimension in this dataId is also present as
        a column in the table.
    datasetType : `DatasetType`
        The dataset type to be associated with the ingested data.
    run : `str`
        The name of the run that will be receiving these datasets.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    prefix : `str`, optional
        Prefix to be used for relative paths. Can be `None` for current
        working directory.
    id_generation_mode : `DatasetIdGenEnum`, optional
        The mode to use when creating the dataset IDs.

    Returns
    -------
    datasets : `list` of `FileDataset`
        The `FileDataset` objects corresponding to the rows in the table.
        The number of elements in this list can be smaller than the number
        of rows in the file because one file can appear in multiple rows
        with different dataIds.
    """
    # The file is the first column and everything else is assumed to
    # be dimensions so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId, run=run, id_generation_mode=id_generation_mode)  # type: ignore

        # Convert path to absolute (because otherwise system will
        # assume relative to datastore root and that is almost certainly
        # never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets

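# Illustrative sketch, not part of the original module: demonstrates that rows
# sharing the same path collapse into a single FileDataset with multiple refs.
# The column names, values, and the "instrument" data ID below are hypothetical.
def _example_grouping(datasetType: DatasetType, run: str) -> list[FileDataset]:  # pragma: no cover
    """Illustrative sketch: three rows but only two distinct files, so two
    FileDataset objects are returned (the first carrying two refs).
    """
    table = Table(
        rows=[
            ("data/raw_0001.fits", 1001, 0),
            ("data/raw_0001.fits", 1001, 1),  # same file, different detector
            ("data/raw_0002.fits", 1002, 0),
        ],
        names=("file", "exposure", "detector"),
    )
    return extract_datasets_from_table(table, {"instrument": "MyCam"}, datasetType, run)
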

def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
    """Convert any additional k=v strings in the dataId tuple to dict
    form.
    """
    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id
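

# Illustrative sketch, not part of the original module: given a butler's
# dimension universe, the command-line ``keyword=value`` strings become a typed
# dataId dict, with each value cast via the dimension's primary-key type.
# The dimension names and values are assumptions for the example.
def _example_parse(universe: DimensionUniverse) -> dict[str, Any]:  # pragma: no cover
    """Illustrative sketch: with a standard universe this would return
    ``{"instrument": "MyCam", "exposure": 1001}``, the exposure value having
    been cast from `str` to `int`.
    """
    return parse_data_id_tuple(("instrument=MyCam", "exposure=1001"), universe)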