Coverage for python/lsst/daf/butler/script/ingest_files.py: 22% (53 statements)
coverage.py v7.2.7, created at 2023-06-07 02:10 -0700
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from ..core import DatasetIdGenEnum, DatasetRef, FileDataset

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
) -> None:
53 """Ingest files from a table.
55 Parameters
56 ----------
57 repo : `str`
58 URI string of the Butler repo to use.
59 dataset_type : `str`
60 The name of the dataset type for the files to be ingested. This
61 dataset type must exist.
62 run : `str`
63 The run in which the files should be ingested.
64 table_file : `str`
65 Path to a table file to read. This file can be in any format that
66 can be read by Astropy so long as Astropy can determine the format
67 itself.
68 data_id : `tuple` of `str`
69 Tuple of strings of the form ``keyword=value`` that can be used
70 to define dataId elements that are fixed for all ingested files
71 found in the table file. This allows those columns to be missing
72 from the table file. Dimensions given here override table columns.
73 formatter : `str`, optional
74 Fully-qualified python class name for the `Formatter` to use
75 to read the ingested files. If `None` the formatter is read from
76 datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Should map to a member of
        `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The default
        is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
    """

    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.registry.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    uri = ResourcePath(table_file, forceAbsolute=False)
    with uri.as_local() as local_file:
        table = Table.read(local_file.ospath)
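
    # Illustrative sketch (not from the original source): the table is expected
    # to hold the file path in its first column and dimension values in the
    # remaining columns. The column names and values here are hypothetical:
    #
    #   Files        exposure  physical_filter
    #   file1.fits   903342    HSC-R
    #   file2.fits   903343    HSC-R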

    datasets = extract_datasets_from_table(
        table, common_data_id, datasetType, run, formatter, prefix, id_gen_mode
    )

    butler.ingest(*datasets, transfer=transfer)
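
# Illustrative sketch (not part of the original module): a hypothetical call to
# ``ingest_files``. The repository path, dataset type, run name, table file and
# data ID below are placeholders.
#
#     ingest_files(
#         repo="/path/to/repo",
#         dataset_type="my_dataset_type",
#         run="ingest/example",
#         table_file="files_to_ingest.ecsv",
#         data_id=("instrument=HSC",),
#         transfer="copy",
#     )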


def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    run: str,
    formatter: str | None = None,
    prefix: str | None = None,
    id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
) -> list[FileDataset]:
130 """Extract datasets from the supplied table.
132 Parameters
133 ----------
134 table : `astropy.table.Table`
135 Table containing the datasets. The first column is assumed to be
136 the file URI and the remaining columns are dimensions.
137 common_data_id : `dict`
138 Data ID values that are common to every row in the table. These
139 take priority if a dimension in this dataId is also present as
140 a column in the table.
141 datasetType : `DatasetType`
142 The dataset type to be associated with the ingested data.
143 run : `str`
144 The name of the run that will be receiving these datasets.
145 formatter : `str`, optional
146 Fully-qualified python class name for the `Formatter` to use
147 to read the ingested files. If `None` the formatter is read from
148 datastore configuration based on the dataset type.
149 prefix : `str`, optional
150 Prefix to be used for relative paths. Can be `None` for current
151 working directory.
152 id_generation_mode: `DatasetIdGenEnum`, optional
153 The mode to use when creating the dataset IDs.
155 Returns
156 -------
157 datasets : `list` of `FileDataset`
158 The `FileDataset` objects corresponding to the rows in the table.
159 The number of elements in this list can be smaller than the number
160 of rows in the file because one file can appear in multiple rows
161 with different dataIds.
162 """
    # The file is the first column and everything else is assumed to
    # be dimensions so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId, run=run, id_generation_mode=id_generation_mode)  # type: ignore

        # Convert path to absolute (because otherwise system will
        # assume relative to datastore root and that is almost certainly
        # never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets
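
# Illustrative sketch (not part of the original module): a hypothetical call to
# ``extract_datasets_from_table`` with an in-memory Astropy table. The column
# names, dimension values, dataset type name and run name are placeholders;
# ``butler`` is assumed to be an existing ``Butler`` for the target repository.
#
#     from astropy.table import Table
#
#     table = Table(
#         rows=[("file1.fits", 903342), ("file1.fits", 903343)],
#         names=("Files", "exposure"),
#     )
#     dataset_type = butler.registry.getDatasetType("my_dataset_type")
#     datasets = extract_datasets_from_table(
#         table,
#         common_data_id={"instrument": "HSC"},
#         datasetType=dataset_type,
#         run="ingest/example",
#         prefix="/data/staging",
#     )
#     # Both rows reference the same file, so a single FileDataset carrying
#     # two refs is returned.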


def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
    # Convert any additional k=v strings in the dataId tuple to dict
    # form.
    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since they will be
        # strings at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id
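
# Illustrative sketch (not part of the original module): a hypothetical call to
# ``parse_data_id_tuple``. The dimension names and values are placeholders;
# ``butler`` is assumed to be an existing ``Butler``.
#
#     universe = butler.registry.dimensions
#     data_id = parse_data_id_tuple(("instrument=HSC", "exposure=903342"), universe)
#     # data_id == {"instrument": "HSC", "exposure": 903342}; each value is cast
#     # from str to the python type of the dimension's primary key.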