Coverage for python/lsst/daf/butler/script/ingest_files.py: 26%
54 statements
coverage.py v6.4.2, created at 2022-07-23 02:26 -0700
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from ..core import DatasetRef, FileDataset
from ..registry import DatasetIdGenEnum

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: Tuple[str, ...] = (),
    formatter: Optional[str] = None,
    id_generation_mode: str = "UNIQUE",
    prefix: Optional[str] = None,
    transfer: str = "auto",
) -> None:
54 """Ingest files from a table.
56 Parameters
57 ----------
58 repo : `str`
59 URI string of the Butler repo to use.
60 dataset_type : `str`
61 The name of the dataset type for the files to be ingested. This
62 dataset type must exist.
63 run : `str`
64 The run in which the files should be ingested.
65 table_file : `str`
66 Path to a table file to read. This file can be in any format that
67 can be read by Astropy so long as Astropy can determine the format
68 itself.
69 data_id : `tuple` of `str`
70 Tuple of strings of the form ``keyword=value`` that can be used
71 to define dataId elements that are fixed for all ingested files
72 found in the table file. This allows those columns to be missing
73 from the table file. Dimensions given here override table columns.
74 formatter : `str`, optional
75 Fully-qualified python class name for the `Formatter` to use
76 to read the ingested files. If `None` the formatter is read from
77 datastore configuration based on the dataset type.
78 id_generation_mode : `str`, optional
79 Mode to use for generating IDs. Should map to `DatasetGenIdEnum`.
80 prefix : `str`, optional
81 Prefix to use when resolving relative paths in table files. The default
82 is to use the current working directory.
83 transfer : `str`, optional
84 Transfer mode to use for ingest.
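
    Examples
    --------
    A minimal sketch of a call; the repository path, run name, dataset
    type, and data ID value here are purely illustrative. It assumes a CSV
    file whose first column is the file URI and whose remaining columns
    are dimension values::

        ingest_files(
            repo="DATA/repo",
            dataset_type="raw",
            run="u/user/ingest",
            table_file="files.csv",
            data_id=("instrument=HSC",),
            transfer="copy",
        )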
85 """
87 # Check that the formatter can be imported -- validate this as soon
88 # as possible before we read a potentially large table file.
89 if formatter:
90 doImport(formatter)
91 else:
92 formatter = None
94 # Force empty string prefix (from click) to None for API compatibility.
95 if not prefix:
96 prefix = None
98 # Convert the dataset ID gen mode string to enum.
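    # For example, the default "UNIQUE" selects DatasetIdGenEnum.UNIQUE.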
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.registry.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    table = Table.read(table_file)

    datasets = extract_datasets_from_table(table, common_data_id, datasetType, formatter, prefix)

    butler.ingest(*datasets, transfer=transfer, run=run, idGenerationMode=id_gen_mode)


def extract_datasets_from_table(
    table: Table,
    common_data_id: Dict,
    datasetType: DatasetType,
    formatter: Optional[str] = None,
    prefix: Optional[str] = None,
) -> List[FileDataset]:
125 """Extract datasets from the supplied table.
127 Parameters
128 ----------
129 table : `astropy.table.Table`
130 Table containing the datasets. The first column is assumed to be
131 the file URI and the remaining columns are dimensions.
132 common_data_id : `dict`
133 Data ID values that are common to every row in the table. These
134 take priority if a dimension in this dataId is also present as
135 a column in the table.
136 datasetType : `DatasetType`
137 The dataset type to be associated with the ingested data.
138 formatter : `str`, optional
139 Fully-qualified python class name for the `Formatter` to use
140 to read the ingested files. If `None` the formatter is read from
141 datastore configuration based on the dataset type.
142 prefix : `str`
143 Prefix to be used for relative paths. Can be `None` for current
144 working directory.
146 Returns
147 -------
148 datasets : `list` of `FileDataset`
149 The `FileDataset` objects corresponding to the rows in the table.
150 The number of elements in this list can be smaller than the number
151 of rows in the file because one file can appear in multiple rows
152 with different dataIds.
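
    Examples
    --------
    An illustrative sketch; the column and dimension names are assumptions,
    not fixed by this function. A table of the form ::

        file    exposure
        a.fits  1001
        a.fits  1002
        b.fits  1003

    produces two `FileDataset` objects: one for ``a.fits`` holding two
    `DatasetRef` entries (exposures 1001 and 1002) and one for ``b.fits``
    holding a single ref.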
153 """
154 # The file is the first column and everything else is assumed to
155 # be dimensions so we need to know the name of that column.
156 file_column = table.colnames[0]
158 # Handle multiple dataIds per file by grouping by file.
159 refs_by_file = defaultdict(list)
160 n_dataset_refs = 0
161 for row in table:
163 # Convert the row to a dataId, remembering to extract the
164 # path column.
165 dataId = dict(row)
166 path = dataId.pop(file_column)
168 # The command line can override a column.
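        # For example (illustrative values), a row giving exposure=1001
        # combined with a common_data_id of {"instrument": "HSC"} yields
        # {"exposure": 1001, "instrument": "HSC"}; if the row also had an
        # instrument column, the command-line value would win.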
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId)  # type: ignore

        # Convert the path to absolute (because otherwise the system will
        # assume it is relative to the datastore root, and that is almost
        # certainly never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets


def parse_data_id_tuple(data_ids: Tuple[str, ...], universe: DimensionUniverse) -> Dict[str, Any]:
    # Convert any additional k=v strings in the dataId tuple to dict
    # form.
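    # For example (illustrative values), ("instrument=HSC", "visit=903334")
    # becomes {"instrument": "HSC", "visit": 903334}, with each value
    # coerced to the dimension's primary key python type.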
    data_id: Dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since it will be
        # a string at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id