Coverage for python/lsst/daf/butler/script/queryDatasets.py: 27%
67 statements

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

import dataclasses
from collections import defaultdict
from collections.abc import Iterable
from types import EllipsisType
from typing import TYPE_CHECKING

import numpy as np
from astropy.table import Table as AstropyTable

from .._butler import Butler
from ..cli.utils import sortAstropyTable

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetRef
    from lsst.daf.butler.registry.queries import DatasetQueryResults
    from lsst.resources import ResourcePath
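

# A frozen dataclass is immutable and hashable, so _RefInfo instances can be
# stored in the set that _Table uses to eliminate duplicate rows.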
@dataclasses.dataclass(frozen=True)
class _RefInfo:
    datasetRef: DatasetRef
    uri: str | None


class _Table:
    """Aggregates rows for a single dataset type and creates an astropy table
    with the aggregated data. Eliminates duplicate rows.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Add a row of information to the table.

        ``uri`` is optional, but it must be used consistently: either provided
        or omitted for every call to a given ``_Table`` instance.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            A dataset ref that will be added as a row in the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as a file location in the table, by default `None`.
        """
        uri_str = str(uri) if uri else None
        self.datasetRefs.add(_RefInfo(datasetRef, uri_str))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Get the table as an astropy table.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            The table with the provided column names and rows.
        """
        # Should never happen; adding a dataset should be the action that
        # causes a _Table to be created.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        refInfo = next(iter(self.datasetRefs))
        dimensions = [
            refInfo.datasetRef.dataId.universe.dimensions[k]
            for k in refInfo.datasetRef.dataId.dimensions.data_coordinate_keys
        ]
        columnNames = ["type", "run", "id", *[str(item) for item in dimensions]]
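
        # For illustration (hypothetical dataset type): a "calexp" dataset
        # would typically yield columns like type, run, id, instrument,
        # detector, visit, plus a URI column when one was supplied to add().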

        # Need to hint the column types for numbers since the per-row
        # constructor of Table does not work this out on its own, and sorting
        # will not work properly without it.
        typeMap = {float: np.float64, int: np.int64}
        columnTypes = [
            None,
            None,
            str,
            *[typeMap.get(type(value)) for value in refInfo.datasetRef.dataId.full_values],
        ]
        if refInfo.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for refInfo in self.datasetRefs:
            row = [
                datasetTypeName,
                refInfo.datasetRef.run,
                str(refInfo.datasetRef.id),
                *refInfo.datasetRef.dataId.full_values,
            ]
            if refInfo.uri:
                row.append(refInfo.uri)
            rows.append(row)

        dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(dataset_table, dimensions, ["type", "run"])


class QueryDatasets:
    """Get dataset refs from a repository.

    Parameters
    ----------
    glob : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the dataset type names to search for.
    collections : iterable [`str`]
        A list of glob-style search strings that fully or partially identify
        the collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of ``collections`` passed in). If used,
        ``collections`` must specify at least one expression and must not
        contain wildcards.
    show_uri : `bool`
        If `True`, include the dataset URI in the output.
    repo : `str` or `None`
        URI to the location of the repo, or URI to a config file describing
        the repo and its location. One of ``repo`` and ``butler`` must be
        `None` and the other must not be `None`.
    butler : `lsst.daf.butler.Butler` or `None`
        The butler to use to query. One of ``repo`` and ``butler`` must be
        `None` and the other must not be `None`.
    """

    def __init__(
        self,
        glob: Iterable[str],
        collections: Iterable[str],
        where: str,
        find_first: bool,
        show_uri: bool,
        repo: str | None = None,
        butler: Butler | None = None,
    ):
        if (repo and butler) or (not repo and not butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        # show_uri requires a datastore.
        without_datastore = not show_uri
        self.butler = butler or Butler.from_config(repo, without_datastore=without_datastore)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(
        self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool
    ) -> None:
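        # An empty glob or collections list falls back to `...` (Ellipsis),
        # which the registry query treats as a wildcard matching everything.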
        datasetTypes = glob or ...
        query_collections: Iterable[str] | EllipsisType = collections or ...
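
        # expanded() returns results whose data IDs carry their full dimension
        # values, which getAstropyTable relies on via dataId.full_values.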
        self.datasets = self.butler.registry.queryDatasets(
            datasetType=datasetTypes, collections=query_collections, where=where, findFirst=find_first
        ).expanded()

    def getTables(self) -> list[AstropyTable]:
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [`astropy.table.Table`]
            A list of astropy tables, one for each dataset type.
        """
        tables: dict[str, _Table] = defaultdict(_Table)
        if not self.showUri:
            for dataset_ref in self.datasets:
                tables[dataset_ref.datasetType.name].add(dataset_ref)
        else:
            d = list(self.datasets)
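            # predict=True asks the datastore for the expected URI even when a
            # dataset's artifact has not been written yet, so every ref still
            # gets a location in the table.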
            ref_uris = self.butler.get_many_uris(d, predict=True)
            for ref, uris in ref_uris.items():
                if uris.primaryURI:
                    tables[ref.datasetType.name].add(ref, uris.primaryURI)
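                # Disassembled composites have per-component URIs; each
                # component is listed under its component dataset type name.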
                for name, uri in uris.componentURIs.items():
                    tables[ref.datasetType.componentTypeName(name)].add(ref, uri)

        return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]

    def getDatasets(self) -> DatasetQueryResults:
        """Get the datasets as a ``DatasetQueryResults`` object.

        Returns
        -------
        refs : `lsst.daf.butler.registry.queries.DatasetQueryResults`
            Dataset references matching the given query criteria.
        """
        return self.datasets
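

# Usage sketch (illustrative only; the repo path, collection, dataset type,
# and where clause below are hypothetical):
#
#     query = QueryDatasets(
#         glob=["calexp"],
#         collections=["HSC/runs/example"],
#         where="instrument = 'HSC' AND visit = 903334",
#         find_first=True,
#         show_uri=False,
#         repo="/path/to/repo",
#     )
#     for table in query.getTables():
#         table.pprint_all()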