Coverage for python/lsst/daf/butler/script/queryDatasets.py: 27%
66 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-28 10:10 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-28 10:10 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23import dataclasses
24from collections import defaultdict
25from collections.abc import Iterable
26from types import EllipsisType
27from typing import TYPE_CHECKING
29import numpy as np
30from astropy.table import Table as AstropyTable
32from .._butler import Butler
33from ..cli.utils import sortAstropyTable
35if TYPE_CHECKING:
36 from lsst.daf.butler import DatasetRef
37 from lsst.daf.butler.registry.queries import DatasetQueryResults
38 from lsst.resources import ResourcePath
@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """Immutable pairing of a dataset ref with the URI string shown for it.

    The dataclass is frozen (with the default ``eq=True``), which makes
    instances hashable so they can be collected in a `set`.
    """

    # The dataset reference this entry describes.
    datasetRef: DatasetRef
    # String form of the location to display, or `None` when there is none.
    uri: str | None
class _Table:
    """Collect the rows belonging to a single dataset type and render them
    as an astropy table.

    Rows are kept in a `set`, so adding an identical ref/URI pair more than
    once results in a single row.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Record one row of information in the table.

        ``uri`` is optional, but usage must be consistent: either supply it
        on every call made to a given ``_Table`` instance, or on none.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            The dataset ref to add as a row of the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as the file location in the table. Defaults to
            `None`, in which case no location is stored for the row.
        """
        # Only the string form is stored; a falsy ``uri`` is recorded as None.
        entry = _RefInfo(datasetRef, None if not uri else str(uri))
        self.datasetRefs.add(entry)

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Build an astropy table from the accumulated rows.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to put in the ``type`` column of every row.

        Returns
        -------
        table : `astropy.table.Table`
            Table with ``type``, ``run`` and ``id`` columns, one column per
            dimension and, when URIs were supplied, a ``URI`` column. The
            table is passed through ``sortAstropyTable`` before returning.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added.
        """
        # Not expected in practice: a _Table is only created as a consequence
        # of adding a dataset to it.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # An arbitrary entry acts as the template for the column layout.
        sample = next(iter(self.datasetRefs))
        sampleDataId = sample.datasetRef.dataId.full
        dimensions = list(sampleDataId.keys())

        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # Numeric columns need an explicit dtype: the per-row Table
        # constructor does not infer one, and sorting would not behave
        # properly without it.
        numericDtypes = {float: np.float64, int: np.int64}
        columnTypes = [None, None, str]
        columnTypes.extend(numericDtypes.get(type(value)) for value in sampleDataId.values())

        if sample.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            cells = [datasetTypeName, ref.run, str(ref.id)]
            cells.extend(ref.dataId.full.values())
            if info.uri:
                cells.append(info.uri)
            rows.append(cells)

        unsorted = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted, dimensions, ["type", "run"])
125class QueryDatasets:
126 """Get dataset refs from a repository.
128 Parameters
129 ----------
130 glob : iterable [`str`]
131 A list of glob-style search string that fully or partially identify
132 the dataset type names to search for.
133 collections : iterable [`str`]
134 A list of glob-style search string that fully or partially identify
135 the collections to search for.
136 where : `str`
137 A string expression similar to a SQL WHERE clause. May involve any
138 column of a dimension table or (as a shortcut for the primary key
139 column of a dimension table) dimension name.
140 find_first : `bool`
141 For each result data ID, only yield one DatasetRef of each DatasetType,
142 from the first collection in which a dataset of that dataset type
143 appears (according to the order of `collections` passed in). If used,
144 `collections` must specify at least one expression and must not contain
145 wildcards.
146 show_uri : `bool`
147 If True, include the dataset URI in the output.
148 repo : `str` or `None`
149 URI to the location of the repo or URI to a config file describing the
150 repo and its location. One of `repo` and `butler` must be `None` and
151 the other must not be `None`.
152 butler : `lsst.daf.butler.Butler` or `None`
153 The butler to use to query. One of `repo` and `butler` must be `None`
154 and the other must not be `None`.
156 """
158 def __init__(
159 self,
160 glob: Iterable[str],
161 collections: Iterable[str],
162 where: str,
163 find_first: bool,
164 show_uri: bool,
165 repo: str | None = None,
166 butler: Butler | None = None,
167 ):
168 if (repo and butler) or (not repo and not butler):
169 raise RuntimeError("One of repo and butler must be provided and the other must be None.")
170 self.butler = butler or Butler(repo)
171 self._getDatasets(glob, collections, where, find_first)
172 self.showUri = show_uri
174 def _getDatasets(
175 self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool
176 ) -> None:
177 datasetTypes = glob if glob else ...
178 query_collections: Iterable[str] | EllipsisType = collections if collections else ...
180 self.datasets = self.butler.registry.queryDatasets(
181 datasetType=datasetTypes, collections=query_collections, where=where, findFirst=find_first
182 ).expanded()
184 def getTables(self) -> list[AstropyTable]:
185 """Get the datasets as a list of astropy tables.
187 Returns
188 -------
189 datasetTables : `list` [``astropy.table._Table``]
190 A list of astropy tables, one for each dataset type.
191 """
192 tables: dict[str, _Table] = defaultdict(_Table)
193 if not self.showUri:
194 for dataset_ref in self.datasets:
195 tables[dataset_ref.datasetType.name].add(dataset_ref)
196 else:
197 d = list(self.datasets)
198 ref_uris = self.butler.datastore.getManyURIs(d, predict=True)
199 for ref, uris in ref_uris.items():
200 if uris.primaryURI:
201 tables[ref.datasetType.name].add(ref, uris.primaryURI)
202 for name, uri in uris.componentURIs.items():
203 tables[ref.datasetType.componentTypeName(name)].add(ref, uri)
205 return [table.getAstropyTable(datasetTypeName) for datasetTypeName, table in tables.items()]
207 def getDatasets(self) -> DatasetQueryResults:
208 """Get the datasets as a list of ``DatasetQueryResults``.
210 Returns
211 -------
212 refs : `lsst.daf.butler.registry.queries.DatasetQueryResults`
213 Dataset references matching the given query criteria.
214 """
215 return self.datasets