Coverage for python/lsst/daf/butler/script/ingest_files.py: 22% of 53 statements
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("ingest_files",)

import logging
from collections import defaultdict
from typing import TYPE_CHECKING, Any

from astropy.table import Table
from lsst.resources import ResourcePath
from lsst.utils import doImport

from .._butler import Butler
from ..core import DatasetIdGenEnum, DatasetRef, FileDataset

if TYPE_CHECKING:
    from ..core import DatasetType, DimensionUniverse

log = logging.getLogger(__name__)


def ingest_files(
    repo: str,
    dataset_type: str,
    run: str,
    table_file: str,
    data_id: tuple[str, ...] = (),
    formatter: str | None = None,
    id_generation_mode: str = "UNIQUE",
    prefix: str | None = None,
    transfer: str = "auto",
) -> None:
    """Ingest files from a table.

    Parameters
    ----------
    repo : `str`
        URI string of the Butler repo to use.
    dataset_type : `str`
        The name of the dataset type for the files to be ingested. This
        dataset type must exist.
    run : `str`
        The run in which the files should be ingested.
    table_file : `str`
        Path to a table file to read. This file can be in any format that
        can be read by Astropy so long as Astropy can determine the format
        itself.
    data_id : `tuple` of `str`
        Tuple of strings of the form ``keyword=value`` that can be used
        to define dataId elements that are fixed for all ingested files
        found in the table file. This allows those columns to be missing
        from the table file. Dimensions given here override table columns.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    id_generation_mode : `str`, optional
        Mode to use for generating IDs. Should map to `DatasetIdGenEnum`.
    prefix : `str`, optional
        Prefix to use when resolving relative paths in table files. The
        default is to use the current working directory.
    transfer : `str`, optional
        Transfer mode to use for ingest.
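
    Examples
    --------
    A minimal sketch of how this function might be invoked; the dataset
    type, run name, dimension columns, and paths below are purely
    illustrative::

        # files.ecsv (any format Astropy can auto-detect), with the file
        # path in the first column and dimension values in the rest:
        #
        #     file            exposure detector
        #     exp1-det0.fits  1        0
        #     exp1-det1.fits  1        1

        ingest_files(
            repo="/path/to/repo",
            dataset_type="raw",
            run="ingest/files",
            table_file="files.ecsv",
            data_id=("instrument=MyCam",),
            prefix="/data/staging",
            transfer="copy",
        )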
    """
    # Check that the formatter can be imported -- validate this as soon
    # as possible before we read a potentially large table file.
    if formatter:
        doImport(formatter)
    else:
        # Normalize a falsy value (e.g. an empty string from the command
        # line) to None so that no formatter override is applied.
        formatter = None

    # Force empty string prefix (from click) to None for API compatibility.
    if not prefix:
        prefix = None

    # Convert the dataset ID gen mode string to enum.
    id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]

    # Create the butler with the relevant run attached.
    butler = Butler(repo, run=run)

    datasetType = butler.registry.getDatasetType(dataset_type)

    # Convert the k=v strings into a dataId dict.
    universe = butler.dimensions
    common_data_id = parse_data_id_tuple(data_id, universe)

    # Read the table assuming that Astropy can work out the format.
    uri = ResourcePath(table_file, forceAbsolute=False)
    with uri.as_local() as local_file:
        table = Table.read(local_file.ospath)

    datasets = extract_datasets_from_table(
        table, common_data_id, datasetType, run, formatter, prefix, id_gen_mode
    )

    butler.ingest(*datasets, transfer=transfer)


def extract_datasets_from_table(
    table: Table,
    common_data_id: dict,
    datasetType: DatasetType,
    run: str,
    formatter: str | None = None,
    prefix: str | None = None,
    id_generation_mode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
) -> list[FileDataset]:
    """Extract datasets from the supplied table.

    Parameters
    ----------
    table : `astropy.table.Table`
        Table containing the datasets. The first column is assumed to be
        the file URI and the remaining columns are dimensions.
    common_data_id : `dict`
        Data ID values that are common to every row in the table. These
        take priority if a dimension in this dataId is also present as
        a column in the table.
    datasetType : `DatasetType`
        The dataset type to be associated with the ingested data.
    run : `str`
        The name of the run that will be receiving these datasets.
    formatter : `str`, optional
        Fully-qualified python class name for the `Formatter` to use
        to read the ingested files. If `None` the formatter is read from
        datastore configuration based on the dataset type.
    prefix : `str`, optional
        Prefix to be used for relative paths. Can be `None` for current
        working directory.
    id_generation_mode : `DatasetIdGenEnum`, optional
        The mode to use when creating the dataset IDs.

    Returns
    -------
    datasets : `list` of `FileDataset`
        The `FileDataset` objects corresponding to the rows in the table.
        The number of elements in this list can be smaller than the number
        of rows in the file because one file can appear in multiple rows
        with different dataIds.
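
    Notes
    -----
    As an illustrative sketch (the column and dimension names here are
    made up), a table such as::

        file        visit detector
        coadd.fits  1     0
        coadd.fits  2     0

    is grouped on the first column, yielding a single `FileDataset` for
    ``coadd.fits`` that carries two `DatasetRef` entries, one per row.
    Any entry in ``common_data_id`` applies to every row and overrides a
    table column of the same name.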
    """
    # The file is the first column and everything else is assumed to
    # be dimensions so we need to know the name of that column.
    file_column = table.colnames[0]

    # Handle multiple dataIds per file by grouping by file.
    refs_by_file = defaultdict(list)
    n_dataset_refs = 0
    for row in table:
        # Convert the row to a dataId, remembering to extract the
        # path column.
        dataId = dict(row)
        path = dataId.pop(file_column)

        # The command line can override a column.
        dataId.update(common_data_id)

        # Create the dataset ref that is to be ingested.
        ref = DatasetRef(datasetType, dataId, run=run, id_generation_mode=id_generation_mode)  # type: ignore

        # Convert the path to absolute (because otherwise the system will
        # assume it is relative to the datastore root, and that is almost
        # certainly never the right default here).
        path_uri = ResourcePath(path, root=prefix, forceAbsolute=True)

        refs_by_file[path_uri].append(ref)
        n_dataset_refs += 1

    datasets = [
        FileDataset(
            path=file_uri,
            refs=refs,
            formatter=formatter,
        )
        for file_uri, refs in refs_by_file.items()
    ]

    log.info("Ingesting %d dataset ref(s) from %d file(s)", n_dataset_refs, len(datasets))

    return datasets


def parse_data_id_tuple(data_ids: tuple[str, ...], universe: DimensionUniverse) -> dict[str, Any]:
    """Convert any additional ``k=v`` strings in the dataId tuple to dict
    form.
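
    As an illustrative sketch (the dimension names and values are made up,
    and ``exposure`` is assumed to have an integer primary key)::

        parse_data_id_tuple(("instrument=MyCam", "exposure=42"), universe)
        # -> {"instrument": "MyCam", "exposure": 42}

    Each value is cast to the Python type of the corresponding dimension's
    primary key, which is why ``"42"`` becomes an integer here.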
    """
    data_id: dict[str, Any] = {}
    for id_str in data_ids:
        dimension_str, value = id_str.split("=")

        try:
            dimension = universe.getStaticDimensions()[dimension_str]
        except KeyError:
            raise ValueError(f"DataID dimension '{dimension_str}' is not known to this universe.") from None

        # Cast the value to the right python type (since it will be
        # a string at this point).
        value = dimension.primaryKey.getPythonType()(value)

        data_id[dimension_str] = value
    return data_id