Coverage for python/lsst/daf/butler/script/queryDatasets.py: 18%
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
21from __future__ import annotations
23from astropy.table import Table as AstropyTable
24from collections import defaultdict, namedtuple
25from typing import Dict
26import numpy as np
27import uuid
29from .. import Butler
30from ..cli.utils import sortAstropyTable
# Hashable (datasetRef, uri) record; ``_Table`` keeps these in a set so that
# identical rows collapse into one.
_RefInfo = namedtuple("_RefInfo", "datasetRef uri")
class _Table:
    """Collect the rows belonging to one dataset type and render them as an
    astropy table.

    Rows are held in a set, so adding the same (dataset ref, URI) pair more
    than once produces a single row.
    """

    def __init__(self):
        # Set of ``_RefInfo`` records; one record per output row.
        self.datasetRefs = set()

    def add(self, datasetRef, uri=None):
        """Record one row for the table.

        ``uri`` is optional, but usage must be consistent: supply it on every
        call made to a given ``_Table`` instance, or on none of them.

        Parameters
        ----------
        datasetRef : ``DatasetRef``
            The dataset ref to add as a row of the table.
        uri : ``ButlerURI``, optional
            The URI to show as the file location in the table, by default
            None.
        """
        # Truthy URIs are stored in string form; anything falsy is stored
        # unchanged.
        location = str(uri) if uri else uri
        self.datasetRefs.add(_RefInfo(datasetRef, location))

    def getAstropyTable(self, datasetTypeName):
        """Build an astropy table from the rows added so far.

        Parameters
        ----------
        datasetTypeName : `str`
            The dataset type name to show in the ``type`` column of the table.

        Returns
        -------
        table : `astropy.table.Table`
            Table with ``type``, ``run`` and ``id`` columns, one column per
            dimension of the data ID and, when URIs were added, a ``URI``
            column. It is the result of passing the assembled table through
            ``sortAstropyTable`` with the dimensions and ``["type", "run"]``.

        Raises
        ------
        RuntimeError
            Raised if no dataset refs have been added to this instance.
        """
        # Should never happen; a _Table is only expected to be created as a
        # consequence of adding a dataset to it.
        if not self.datasetRefs:
            raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")

        # An arbitrary row supplies the column layout for the whole table.
        sample = next(iter(self.datasetRefs))
        sampleDataId = sample.datasetRef.dataId.full
        dimensions = list(sampleDataId.keys())

        # UUID ids are rendered as strings; any other id is treated as a
        # 64-bit integer.
        usesUuid = isinstance(sample.datasetRef.id, uuid.UUID)
        idType = str if usesUuid else np.int64

        # Numeric columns need an explicit type hint: the per-row Table
        # constructor does not infer it, and sorting will not work properly
        # without it.
        typeMap = {float: np.float64, int: np.int64}

        columnNames = ["type", "run", "id"]
        columnNames.extend(str(dimension) for dimension in dimensions)
        columnTypes = [None, None, idType]
        columnTypes.extend(typeMap.get(type(value)) for value in sampleDataId.values())
        if sample.uri:
            columnNames.append("URI")
            columnTypes.append(None)

        rows = []
        for info in self.datasetRefs:
            ref = info.datasetRef
            row = [datasetTypeName, ref.run, str(ref.id) if usesUuid else ref.id]
            row.extend(ref.dataId.full.values())
            if info.uri:
                row.append(info.uri)
            rows.append(row)

        unsorted = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
        return sortAstropyTable(unsorted, dimensions, ["type", "run"])
class QueryDatasets:
    """Query a repository for dataset refs.

    Parameters
    ----------
    repo : `str` or `None`
        URI to the location of the repo or URI to a config file describing the
        repo and its location. Exactly one of `repo` and `butler` must be
        given; the other must be `None`.
    butler : ``lsst.daf.butler.Butler`` or `None`
        The butler to use to query. Exactly one of `repo` and `butler` must be
        given; the other must be `None`.
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search for.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Raises
    ------
    RuntimeError
        Raised if both or neither of `repo` and `butler` are provided.
    """

    def __init__(self, glob, collections, where, find_first, show_uri, repo=None, butler=None):
        # Exactly one of the two sources may be supplied.
        if bool(repo) == bool(butler):
            raise RuntimeError("One of repo and butler must be provided and the other must be None.")
        self.butler = butler if butler else Butler(repo)
        self._getDatasets(glob, collections, where, find_first)
        self.showUri = show_uri

    def _getDatasets(self, glob, collections, where, find_first):
        """Run the registry query and keep the expanded results in
        ``self.datasets``.

        An empty ``glob`` or ``collections`` is replaced by ``...`` before
        being handed to ``queryDatasets``.
        """
        datasetTypes = glob if glob else ...
        searchCollections = collections if collections else ...

        results = self.butler.registry.queryDatasets(
            datasetType=datasetTypes,
            collections=searchCollections,
            where=where,
            findFirst=find_first,
        )
        self.datasets = results.expanded()

    def getTables(self):
        """Get the datasets as a list of astropy tables.

        Returns
        -------
        datasetTables : `list` [``astropy.table.Table``]
            A list of astropy tables, one for each dataset type.
        """
        tablesByType: Dict[str, _Table] = defaultdict(_Table)
        for ref in self.datasets:
            if not self.showUri:
                tablesByType[ref.datasetType.name].add(ref)
                continue

            primaryURI, componentURIs = self.butler.getURIs(ref, collections=ref.run, predict=True)
            if primaryURI:
                tablesByType[ref.datasetType.name].add(ref, primaryURI)
            # Each component URI is filed under its component dataset type
            # name.
            for componentName, componentURI in componentURIs.items():
                tablesByType[ref.datasetType.componentTypeName(componentName)].add(ref, componentURI)

        datasetTables = []
        for typeName, table in tablesByType.items():
            datasetTables.append(table.getAstropyTable(typeName))
        return datasetTables

    def getDatasets(self):
        """Get the datasets found by the query.

        Returns
        -------
        refs : ``queries.DatasetQueryResults``
            Dataset references matching the given query criteria.
        """
        return self.datasets