Coverage for python/lsst/daf/butler/script/queryDatasets.py: 17%
68 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 14:18 -0800
« prev ^ index » next coverage.py v6.5.0, created at 2022-12-08 14:18 -0800
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23import uuid
24from collections import defaultdict, namedtuple
25from typing import Dict
27import numpy as np
28from astropy.table import Table as AstropyTable
30from .._butler import Butler
31from ..cli.utils import sortAstropyTable
# Record kept for each row of a ``_Table``: the dataset ref plus its URI.
33_RefInfo = namedtuple("_RefInfo", ["datasetRef", "uri"])
class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table.

    Rows are held in a set, so adding an identical row twice has no effect.
    """

    def __init__(self):
        self.datasetRefs = set()

    def add(self, datasetRef, uri=None):
        """Record one row for the table.

        Callers must be consistent about ``uri``: either supply it on every
        call made to a given ``_Table`` instance, or on none of them.

        Parameters
        ----------
        datasetRef : `DatasetRef`
            The dataset ref to record as a row.
        uri : `lsst.resources.ResourcePath`, optional
            Location to display for the dataset. A truthy value is stored as
            its string form. Defaults to `None`.
        """
        uriText = str(uri) if uri else uri
        self.datasetRefs.add(_RefInfo(datasetRef, uriText))

    def getAstropyTable(self, datasetTypeName):
        """Build an astropy table from the recorded rows.

        Parameters
        ----------
        datasetTypeName : `str`
            Value placed in the ``type`` column of every row.

        Returns
        -------
        table : `astropy.table.Table`
            Table with one row per recorded entry, passed through
            ``sortAstropyTable`` before being returned.

        Raises
        ------
        RuntimeError
            Raised if no rows have been added.
        """
        # Not expected in practice: a _Table is only created as a consequence
        # of adding a dataset to it.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # Any one stored row serves as the template for the column layout.
        template = next(iter(self.datasetRefs))
        templateRef = template.datasetRef

        dimensions = list(templateRef.dataId.full.keys())
        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)

        # UUID ids are rendered as strings; any other id is treated as an
        # integer column.
        idType = str if isinstance(templateRef.id, uuid.UUID) else np.int64

        # Numeric columns need an explicit dtype: the row-wise Table
        # constructor does not infer one, and sorting misbehaves without it.
        numericTypes = {float: np.float64, int: np.int64}
        columnTypes = [None, None, idType]
        for value in templateRef.dataId.full.values():
            columnTypes.append(numericTypes.get(type(value)))

        if template.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            row = [datasetTypeName, ref.run]
            row.append(str(ref.id) if idType is str else ref.id)
            row.extend(ref.dataId.full.values())
            if info.uri:
                row.append(info.uri)
            rows.append(row)

        unsorted = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted, dimensions, ["type", "run"])
class QueryDatasets:
    """Query a repository for dataset refs.

    Parameters
    ----------
    repo : `str` or `None`
        URI of the repo, or of a config file describing the repo and where it
        lives. Exactly one of `repo` and `butler` must be given; the other
        must be `None`.
    butler : ``lsst.daf.butler.Butler`` or `None`
        Butler to run the query with. Exactly one of `repo` and `butler` must
        be given; the other must be `None`.
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to look for.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to look in.
    where : `str`
        Expression resembling a SQL WHERE clause. It may refer to any column
        of a dimension table, or to a dimension name as shorthand for that
        table's primary key column.
    find_first : `bool`
        For each result data ID, yield only one DatasetRef per DatasetType,
        taken from the first collection (in the order given by
        `collections`) that holds a dataset of that type. When set,
        `collections` must contain at least one expression and no wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Raises
    ------
    RuntimeError
        Raised if both or neither of `repo` and `butler` are provided.
    """

    def __init__(self, glob, collections, where, find_first, show_uri, repo=None, butler=None):
        # Exactly one of the two sources may be supplied (judged by
        # truthiness); both or neither is an error.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler if butler else Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(self, glob, collections, where, find_first):
        """Run the registry query and store the expanded results on
        ``self.datasets``.
        """
        # Falsy selections are replaced by ``...`` before querying
        # (presumably the registry's "match everything" wildcard - confirm
        # against the Registry.queryDatasets documentation).
        datasetTypes = glob if glob else ...
        searchCollections = collections if collections else ...

        results = self.butler.registry.queryDatasets(
            datasetType=datasetTypes,
            collections=searchCollections,
            where=where,
            findFirst=find_first,
        )
        self.datasets = results.expanded()

    def getTables(self):
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [``astropy.table.Table``]
            One table per dataset type name encountered. When URIs are shown,
            component URIs are filed under their component dataset type name.
        """
        tables: Dict[str, _Table] = defaultdict(_Table)
        if self.showUri:
            refs = list(self.datasets)
            uriLookup = self.butler.datastore.getManyURIs(refs, predict=True)
            for ref, uris in uriLookup.items():
                if uris.primaryURI:
                    tables[ref.datasetType.name].add(ref, uris.primaryURI)
                for component, componentUri in uris.componentURIs.items():
                    tables[ref.datasetType.componentTypeName(component)].add(ref, componentUri)
        else:
            for ref in self.datasets:
                tables[ref.datasetType.name].add(ref)

        datasetTables = []
        for typeName, table in tables.items():
            datasetTables.append(table.getAstropyTable(typeName))
        return datasetTables

    def getDatasets(self):
        """Get the query results for the matching datasets.

        Returns
        -------
        refs : ``queries.DatasetQueryResults``
            Dataset references matching the given query criteria.
        """
        return self.datasets