Coverage for python/lsst/daf/butler/script/queryDatasets.py : 11%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <http://www.gnu.org/licenses/>.
22from astropy.table import Table as AstropyTable
23from collections import defaultdict, namedtuple
24import numpy as np
26from .. import Butler
27from ..core.utils import globToRegex
29_RefInfo = namedtuple("RefInfo", "datasetRef uri")
32class _Table:
33 """Aggregates rows for a single dataset type, and creates an astropy table
34 with the aggregated data. Eliminates duplicate rows.
36 Parameters
37 ----------
38 columnNames : `list` [`str`]
39 The names of columns.
40 """
42 def __init__(self):
43 self.datasetRefs = set()
45 def add(self, datasetRef, uri=None):
46 """Add a row of information to the table.
48 ``uri`` is optional but must be the consistent; provided or not, for
49 every call to a ``_Table`` instance.
51 Parameters
52 ----------
53 datasetRef : ``DatasetRef``
54 A dataset ref that will be added as a row in the table.
55 uri : ``ButlerURI``, optional
56 The URI to show as a file location in the table, by default None
57 """
58 if uri:
59 uri = str(uri)
60 self.datasetRefs.add(_RefInfo(datasetRef, uri))
62 def getAstropyTable(self, datasetTypeName):
63 """Get the table as an astropy table.
65 Parameters
66 ----------
67 datasetTypeName : `str`
68 The dataset type name to show in the ``type`` column of the table.
70 Returns
71 -------
72 table : `astropy.table._Table`
73 The table with the provided column names and rows.
74 """
75 # Should never happen; adding a dataset should be the action that
76 # causes a _Table to be created.
77 if not self.datasetRefs:
78 raise RuntimeError(f"No DatasetRefs were provided for dataset type {datasetTypeName}")
80 refInfo = next(iter(self.datasetRefs))
81 dimensions = list(refInfo.datasetRef.dataId.full.keys())
82 columnNames = ["type", "run", "id",
83 *[str(item) for item in dimensions]]
85 # Need to hint the column types for numbers since the per-row
86 # constructor of Table does not work this out on its own and sorting
87 # will not work properly without.
88 typeMap = {float: np.float, int: np.int64}
89 columnTypes = [None, None, np.int64,
90 *[typeMap.get(type(value)) for value in refInfo.datasetRef.dataId.full.values()]]
91 if refInfo.uri:
92 columnNames.append("URI")
93 columnTypes.append(None)
95 rows = []
96 for refInfo in self.datasetRefs:
97 row = [datasetTypeName,
98 refInfo.datasetRef.run,
99 refInfo.datasetRef.id,
100 *[value for value in refInfo.datasetRef.dataId.full.values()]]
101 if refInfo.uri:
102 row.append(refInfo.uri)
103 rows.append(row)
105 dataset_table = AstropyTable(np.array(rows), names=columnNames, dtype=columnTypes)
107 # For sorting we want to ignore the id
108 # We also want to move temporal or spatial dimensions earlier
109 sort_first = ["type", "run"]
110 sort_early = []
111 sort_late = []
112 for dim in dimensions:
113 if dim.spatial or dim.temporal:
114 sort_early.extend(dim.required.names)
115 else:
116 sort_late.append(str(dim))
117 sort_keys = sort_first + sort_early + sort_late
119 # The required names above means that we have the possibility of
120 # repeats of sort keys. Now have to remove them
121 # (order is retained by dict creation).
122 sort_keys = list(dict.fromkeys(sort_keys).keys())
124 dataset_table.sort(sort_keys)
125 return dataset_table
def queryDatasets(repo, glob, collections, where, find_first, show_uri):
    """Query a repository for datasets and tabulate them per dataset type.

    Parameters
    ----------
    repo : `str`
        URI to the location of the repo or URI to a config file describing the
        repo and its location.
    glob : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        dataset type names to search for. If empty, all dataset types are
        searched.
    collections : iterable [`str`]
        Glob-style search strings that fully or partially identify the
        collections to search for. If empty, all collections are searched.
    where : `str`
        A string expression similar to a SQL WHERE clause. May involve any
        column of a dimension table or (as a shortcut for the primary key
        column of a dimension table) dimension name.
    find_first : `bool`
        For each result data ID, only yield one DatasetRef of each DatasetType,
        from the first collection in which a dataset of that dataset type
        appears (according to the order of `collections` passed in). If used,
        `collections` must specify at least one expression and must not contain
        wildcards.
    show_uri : `bool`
        If True, include the dataset URI in the output.

    Returns
    -------
    datasetTables : `list` [``astropy.table.Table``]
        A list of astropy tables, one for each dataset type.
    """
    butler = Butler(repo)

    # An empty expression means "everything", which the registry spells as
    # Ellipsis.
    datasetTypeExpr = globToRegex(glob) or ...

    if not collections:
        collections = ...
    elif not find_first:
        # With find_first the collections are handed over untouched, in the
        # order given; otherwise they are treated as glob patterns.
        collections = globToRegex(collections)

    foundRefs = butler.registry.queryDatasets(
        datasetType=datasetTypeExpr,
        collections=collections,
        where=where,
        findFirst=find_first,
    )

    # One _Table per dataset type name (or component type name), created on
    # first use.
    tablesByType = defaultdict(_Table)

    for ref in foundRefs:
        if show_uri:
            primaryURI, componentURIs = butler.getURIs(ref, collections=ref.run)
            if primaryURI:
                tablesByType[ref.datasetType.name].add(ref, primaryURI)
            # Each component URI is listed under its own component type name.
            for componentName, componentURI in componentURIs.items():
                componentTypeName = ref.datasetType.componentTypeName(componentName)
                tablesByType[componentTypeName].add(ref, componentURI)
        else:
            tablesByType[ref.datasetType.name].add(ref)

    astropyTables = []
    for typeName, table in tablesByType.items():
        astropyTables.append(table.getAstropyTable(typeName))
    return astropyTables