# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("CollectionSummary", "SerializedCollectionSummary")

import dataclasses
from collections.abc import Generator, Iterable, Mapping, Set
from typing import cast

import pydantic

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType, SerializedDatasetType
from .._named import NamedValueSet
from ..dimensions import DataCoordinate, DimensionUniverse


@dataclasses.dataclass
class CollectionSummary:
    """A summary of the datasets that can be found in a collection."""

    def copy(self) -> CollectionSummary:
        """Return a deep copy of this object.

        Returns
        -------
        copy : `CollectionSummary`
            A copy of ``self`` that can be modified without modifying ``self``
            at all.
        """
        return CollectionSummary(
            dataset_types=self.dataset_types.copy(), governors=_copy_governors(self.governors)
        )

    def add_datasets_generator(self, refs: Iterable[DatasetRef]) -> Generator[DatasetRef, None, None]:
        """Include the given datasets in the summary, yielding them back as a
        generator.

        Parameters
        ----------
        refs : `~collections.abc.Iterable` [ `DatasetRef` ]
            Datasets to include.

        Yields
        ------
        ref : `DatasetRef`
            The same dataset references originally passed in.

        Notes
        -----
        As a generator, this method does nothing if its returned iterator is
        not consumed. Call `add_datasets` instead to avoid this; this method
        is intended for the case where the given iterable may be single-pass
        and a copy is not desired, but other processing needs to be done on
        its elements.
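
        Examples
        --------
        A minimal sketch; ``refs`` is assumed to be an existing iterable of
        `DatasetRef`. The summary is only updated as the returned generator
        is consumed::

            summary = CollectionSummary()
            for ref in summary.add_datasets_generator(refs):
                ...  # single-pass, per-ref processing happens here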

81 """ 

82 for ref in refs: 

83 self.dataset_types.add(ref.datasetType) 

84 for gov in ref.dataId.dimensions.governors: 

85 self.governors.setdefault(gov, set()).add(cast(str, ref.dataId[gov])) 

86 yield ref 

87 

88 def add_datasets(self, refs: Iterable[DatasetRef]) -> None: 

89 """Include the given datasets in the summary. 

90 

91 Parameters 

92 ---------- 

93 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

94 Datasets to include. 

95 """ 

96 for _ in self.add_datasets_generator(refs): 

97 pass 

98 

99 def add_data_ids_generator( 

100 self, dataset_type: DatasetType, data_ids: Iterable[DataCoordinate] 

101 ) -> Generator[DataCoordinate, None, None]: 

102 """Include the given dataset type and data IDs in the summary, yielding 

103 them back as a generator. 

104 

105 Parameters 

106 ---------- 

107 dataset_type : `DatasetType` 

108 Dataset type to include. 

109 data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ] 

110 Data IDs to include. 

111 

112 Yields 

113 ------ 

114 data_id : `DataCoordinate` 

115 The same data IDs originally passed in. 

116 

117 Notes 

118 ----- 

119 As a generator, this method does nothing if its return iterator is not 

120 used. Call `add_data_ids` instead to avoid this; this method is 

121 intended for the case where the given iterable may be single-pass and a 

122 copy is not desired, but other processing needs to be done on its 

123 elements. 

124 """ 

125 self.dataset_types.add(dataset_type) 

126 for data_id in data_ids: 

127 for gov in data_id.dimensions.governors: 

128 self.governors.setdefault(gov, set()).add(cast(str, data_id[gov])) 

129 yield data_id 

130 

131 def add_data_ids(self, dataset_type: DatasetType, data_ids: Iterable[DataCoordinate]) -> None: 

132 """Include the given dataset type and data IDs in the summary. 

133 

134 Parameters 

135 ---------- 

136 dataset_type : `DatasetType` 

137 Dataset type to include. 

138 data_ids : `~collections.abc.Iterable` [ `DataCoordinate` ] 

139 Data IDs to include. 
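
        Examples
        --------
        A minimal sketch; ``dataset_type`` and ``data_ids`` are assumed to be
        an existing `DatasetType` and iterable of `DataCoordinate`. Unlike
        the generator variant, this consumes the input eagerly::

            summary = CollectionSummary()
            summary.add_data_ids(dataset_type, data_ids)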

140 """ 

141 for _ in self.add_data_ids_generator(dataset_type, data_ids): 

142 pass 

143 

144 def update(self, *args: CollectionSummary) -> None: 

145 """Update this summary with dataset types and governor dimension values 

146 from other summaries. 

147 

148 Parameters 

149 ---------- 

150 *args : `CollectionSummary` 

151 Summaries to include in ``self``. 

152 """ 

153 for arg in args: 

154 self.dataset_types.update(arg.dataset_types) 

155 for gov, values in arg.governors.items(): 

156 self.governors.setdefault(gov, set()).update(values) 

157 

158 def union(*args: CollectionSummary) -> CollectionSummary: 

159 """Construct a summary that contains all dataset types and governor 

160 dimension values in any of the inputs. 

161 

162 Parameters 

163 ---------- 

164 *args : `CollectionSummary` 

165 Summaries to combine. 

166 

167 Returns 

168 ------- 

169 unioned : `CollectionSummary` 

170 New summary object that represents the union of the given ones. 
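
        Examples
        --------
        A minimal sketch; ``a`` and ``b`` are assumed to be existing
        summaries. Neither input is modified::

            merged = CollectionSummary.union(a, b)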

171 """ 

172 result = CollectionSummary() 

173 result.update(*args) 

174 return result 

175 

176 def is_compatible_with( 

177 self, 

178 dataset_type: DatasetType, 

179 dimensions: Mapping[str, Set[str]], 

180 rejections: list[str] | None = None, 

181 name: str | None = None, 

182 ) -> bool: 

183 """Test whether the collection summarized by this object should be 

184 queried for a given dataset type and governor dimension values. 

185 

186 Parameters 

187 ---------- 

188 dataset_type : `DatasetType` 

189 Dataset type being queried. If this collection has no instances of 

190 this dataset type (or its parent dataset type, if it is a 

191 component), `False` will always be returned. 

192 dimensions : `~collections.abc.Mapping` 

193 Bounds on the values governor dimensions can take in the query, 

194 usually from a WHERE expression, as a mapping from dimension name 

195 to a set of `str` governor dimension values. 

196 rejections : `list` [ `str` ], optional 

197 If provided, a list that will be populated with a log- or 

198 exception-friendly message explaining why this dataset is 

199 incompatible with this collection when `False` is returned. 

200 name : `str`, optional 

201 Name of the collection this object summarizes, for use in messages 

202 appended to ``rejections``. Ignored if ``rejections`` is `None`. 

203 

204 Returns 

205 ------- 

206 compatible : `bool` 

207 `True` if the dataset query described by this summary and the given 

208 arguments might yield non-empty results; `False` if the result from 

209 such a query is definitely empty. 
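
        Examples
        --------
        A minimal sketch; ``summary`` and ``dataset_type`` are assumed to
        exist, and ``instrument`` is assumed to be a governor dimension of
        the dataset type's data ID::

            messages: list[str] = []
            if not summary.is_compatible_with(
                dataset_type, {"instrument": {"HSC"}}, rejections=messages, name="my_run"
            ):
                print("; ".join(messages))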

210 """ 

211 parent = dataset_type if not dataset_type.isComponent() else dataset_type.makeCompositeDatasetType() 

212 if parent.name not in self.dataset_types.names: 

213 if rejections is not None: 

214 rejections.append(f"No datasets of type {parent.name} in collection {name!r}.") 

215 return False 

216 for gov_name in self.governors.keys() & dataset_type.dimensions.names & dimensions.keys(): 

217 values_in_collection = self.governors[gov_name] 

218 values_given = dimensions[gov_name] 

219 if values_in_collection.isdisjoint(values_given): 

220 if rejections is not None: 

221 rejections.append( 

222 f"No datasets with {gov_name} in {values_given} in collection {name!r}." 

223 ) 

224 return False 

225 return True 

226 

227 def to_simple(self) -> SerializedCollectionSummary: 

228 return SerializedCollectionSummary( 

229 dataset_types=[x.to_simple() for x in self.dataset_types], 

230 governors=_copy_governors(self.governors), 

231 ) 

232 

233 @staticmethod 

234 def from_simple(simple: SerializedCollectionSummary, universe: DimensionUniverse) -> CollectionSummary: 

235 summary = CollectionSummary() 

236 summary.dataset_types = NamedValueSet( 

237 [DatasetType.from_simple(x, universe) for x in simple.dataset_types] 

238 ) 

239 summary.governors = _copy_governors(simple.governors) 

240 return summary 
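
    # A hypothetical round-trip sketch, assuming ``summary`` is an existing
    # `CollectionSummary` and ``universe`` is the `DimensionUniverse` needed
    # to reconstruct the serialized dataset types:
    #
    #     simple = summary.to_simple()
    #     restored = CollectionSummary.from_simple(simple, universe)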

    dataset_types: NamedValueSet[DatasetType] = dataclasses.field(default_factory=NamedValueSet)
    """Dataset types that may be present in the collection
    (`NamedValueSet` [ `DatasetType` ]).

    A dataset type not in this set is definitely not in the collection, but
    the converse is not necessarily true.
    """

    governors: dict[str, set[str]] = dataclasses.field(default_factory=dict)
    """Governor data ID values that are present in the collection's dataset
    data IDs (`dict` [ `str`, `set` [ `str` ] ]).

    A data ID value absent from this mapping is not necessarily inconsistent
    with a query against the collection: the query may involve only dataset
    types whose data IDs do not include one or more governor dimensions, in
    which case the values of those dimensions are unconstrained by this
    collection.
    """


def _copy_governors(governors: dict[str, set[str]]) -> dict[str, set[str]]:
    """Make an independent copy of the 'governors' data structure."""
    return {k: v.copy() for k, v in governors.items()}


class SerializedCollectionSummary(pydantic.BaseModel):
    """Serialized version of `CollectionSummary`."""

    dataset_types: list[SerializedDatasetType]
    governors: dict[str, set[str]]
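

# A hypothetical JSON round-trip sketch, assuming pydantic v2 (which provides
# ``model_dump_json`` and ``model_validate_json``) and an existing ``summary``
# and ``universe``:
#
#     payload = summary.to_simple().model_dump_json()
#     simple = SerializedCollectionSummary.model_validate_json(payload)
#     restored = CollectionSummary.from_simple(simple, universe)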