Coverage for python/lsst/daf/butler/script/queryDatasets.py: 26%
75 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-07 10:08 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2023-01-07 10:08 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23import dataclasses
24import uuid
25from collections import defaultdict
26from collections.abc import Iterable
27from typing import TYPE_CHECKING
29import numpy as np
30from astropy.table import Table as AstropyTable
32from .._butler import Butler
33from ..cli.utils import sortAstropyTable
35if TYPE_CHECKING: 35 ↛ 36line 35 didn't jump to line 36, because the condition on line 35 was never true
36 from lsst.daf.butler import DatasetRef
37 from lsst.daf.butler.registry.queries import DatasetQueryResults
38 from lsst.resources import ResourcePath
@dataclasses.dataclass(frozen=True)
class _RefInfo:
    """Pair a dataset ref with the string form of its URI, if any.

    The dataclass is frozen, which makes instances hashable so that
    ``_Table`` can keep them in a `set`.
    """

    # The dataset reference that a table row describes.
    datasetRef: DatasetRef

    # String form of the dataset location, or `None` when no URI was given.
    uri: str | None
class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table.

    Rows are held in a `set`, so adding the same ref/URI pair more than once
    yields a single row.
    """

    datasetRefs: set[_RefInfo]

    def __init__(self) -> None:
        self.datasetRefs = set()

    def add(self, datasetRef: DatasetRef, uri: ResourcePath | None = None) -> None:
        """Record one row for the table.

        ``uri`` is optional but must be used consistently: either supply it
        on every call made to a given ``_Table`` instance, or on none.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            The dataset ref to show as a row in the table.
        uri : `lsst.resources.ResourcePath`, optional
            The URI to show as the file location for this row. Defaults to
            `None`, meaning no location is recorded.
        """
        location = None if not uri else str(uri)
        self.datasetRefs.add(_RefInfo(datasetRef, location))

    def getAstropyTable(self, datasetTypeName: str) -> AstropyTable:
        """Build an astropy table from the rows added so far.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to put in the ``type`` column of every row.

        Returns
        -------
        table : `astropy.table.Table`
            The table of datasets, passed through ``sortAstropyTable`` with
            the data ID dimensions and the ``type`` and ``run`` columns.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this table.
        """
        # Should never happen; adding a dataset is what causes a _Table to be
        # created in the first place.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # An arbitrary entry serves as the template for the column layout.
        template = next(iter(self.datasetRefs))
        template_ref = template.datasetRef

        dimensions = list(template_ref.dataId.full.keys())
        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # UUID dataset IDs are shown as strings; any other ID is treated as
        # a 64-bit integer.
        if isinstance(template_ref.id, uuid.UUID):
            idType = str
        else:
            idType = np.int64

        # Numeric columns need an explicit type hint: the per-row Table
        # constructor does not work it out on its own, and sorting does not
        # work properly without it.
        numericTypes = {float: np.float64, int: np.int64}
        columnTypes = [None, None, idType]
        columnTypes.extend(numericTypes.get(type(value)) for value in template_ref.dataId.full.values())

        if template.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            cells = [datasetTypeName, ref.run]
            cells.append(ref.id if idType is not str else str(ref.id))
            cells.extend(ref.dataId.full.values())
            if info.uri:
                cells.append(info.uri)
            rows.append(cells)

        unsorted = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted, dimensions, ["type", "run"])
class QueryDatasets:
    """Query a repository for dataset refs.

    Parameters
    ----------
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for. If empty, the dataset type is not
        constrained.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search. If empty, the collections are not
        constrained.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) a dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each
        DatasetType, from the first collection in which a dataset of that
        dataset type appears (according to the order of ``collections``
        passed in). If used, ``collections`` must specify at least one
        expression and must not contain wildcards.
    show_uri : `bool`
        If `True`, include the dataset URI in the output tables.
    repo : `str` or `None`
        URI to the location of the repo, or URI to a config file describing
        the repo and its location. Exactly one of ``repo`` and ``butler``
        must be given; the other must be `None`.
    butler : `lsst.daf.butler.Butler` or `None`
        The butler to query with. Exactly one of ``repo`` and ``butler``
        must be given; the other must be `None`.

    Raises
    ------
    RuntimeError
        Raised if both or neither of ``repo`` and ``butler`` are provided.
    """

    def __init__(
        self,
        glob: Iterable[str],
        collections: Iterable[str],
        where: str,
        find_first: bool,
        show_uri: bool,
        repo: str | None = None,
        butler: Butler | None = None,
    ):
        # Exactly one of the two must be supplied: both set and both unset
        # are equally invalid.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler or Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(
        self, glob: Iterable[str], collections: Iterable[str], where: str, find_first: bool
    ) -> None:
        """Run the registry query and keep the expanded results on
        ``self.datasets``.

        An empty ``glob`` or ``collections`` is replaced by ``...`` before
        being handed to the registry.
        """
        results = self.butler.registry.queryDatasets(
            datasetType=glob or ...,
            collections=collections or ...,
            where=where,
            findFirst=find_first,
        )
        self.datasets = results.expanded()

    def getTables(self) -> list[AstropyTable]:
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [`astropy.table.Table`]
            A list of astropy tables, one for each dataset type name
            (component URIs are tabulated under their component dataset
            type name).
        """
        byTypeName: dict[str, _Table] = defaultdict(_Table)

        def render() -> list[AstropyTable]:
            return [table.getAstropyTable(typeName) for typeName, table in byTypeName.items()]

        if not self.showUri:
            for ref in self.datasets:
                byTypeName[ref.datasetType.name].add(ref)
            return render()

        # Look the URIs up in a single call for all refs.
        allRefs = list(self.datasets)
        urisByRef = self.butler.datastore.getManyURIs(allRefs, predict=True)
        for ref, datasetUris in urisByRef.items():
            primary = datasetUris.primaryURI
            if primary:
                byTypeName[ref.datasetType.name].add(ref, primary)
            for component, componentUri in datasetUris.componentURIs.items():
                byTypeName[ref.datasetType.componentTypeName(component)].add(ref, componentUri)
        return render()

    def getDatasets(self) -> DatasetQueryResults:
        """Get the datasets as the query results object held by this
        instance.

        Returns
        -------
        refs : `lsst.daf.butler.registry.queries.DatasetQueryResults`
            Dataset references matching the given query criteria.
        """
        return self.datasets