Coverage for python/lsst/daf/butler/transfers/_yaml.py : 11%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from ..core import (
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    DataCoordinate,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core.utils import iterable
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.wildcards import DatasetTypeRestriction
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord
from ._interfaces import RepoExportBackend, RepoImportBackend


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveCollection(self, record: CollectionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = [
                [name, list(restriction.names) if restriction.names is not ... else None]  # type: ignore
                for name, restriction in record.children
            ]
        self.data.append(data)
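
    # An illustrative sketch (keys come from saveCollection above; names and
    # values are invented) of the "collection" entry appended for a CHAINED
    # collection:
    #
    #   type: collection
    #   collection_type: CHAINED
    #   name: some-chain                # hypothetical collection name
    #   children:
    #   - [some-run, null]              # null: no dataset-type restriction
    #   - [another-run, [some_type]]    # restricted to the listed dataset types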

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
            "is_calibration": datasetType.isCalibration(),
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                    "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })
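
    # An illustrative sketch (keys from the code above; all values invented
    # for the example) of the pair of entries appended by saveDatasets:
    #
    #   - type: dataset_type
    #     name: some_dataset_type
    #     dimensions: [instrument, visit, detector]
    #     storage_class: SomeStorageClass
    #     is_calibration: false
    #   - type: dataset
    #     dataset_type: some_dataset_type
    #     run: some-run
    #     records:
    #     - dataset_id: [42]
    #       data_id: [{instrument: SomeCam, visit: 1, detector: 0}]
    #       path: relative/path/within/the/export.fits
    #       formatter: some.package.SomeFormatter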

    def saveDatasetAssociations(self, collection: str, collectionType: CollectionType,
                                associations: Iterable[DatasetAssociation]) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "dataset_ids": [assoc.ref.id for assoc in associations],
            })
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[int]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "validity_ranges": [
                    {
                        "begin": timespan.begin,
                        "end": timespan.end,
                        "dataset_ids": dataset_ids,
                    }
                    for timespan, dataset_ids in idsByTimespan.items()
                ]
            })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
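
    # The document written by finish() has the top-level form:
    #
    #   description: Butler Data Repository Export
    #   version: 0
    #   data:
    #   - ...entries appended by the save* methods above...
    #
    # A minimal direct-use sketch; this backend is normally driven by
    # higher-level export code, and `stream`, `element`, and `records` are
    # assumed to exist:
    #
    #   backend = YamlRepoExportBackend(stream)
    #   backend.saveDimensionData(element, *records)
    #   backend.finish()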


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """
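
    # The stream is expected to contain a document of the form written by
    # YamlRepoExportBackend.finish(): a mapping with "description", "version",
    # and "data" keys, where "data" is a list of dictionaries each carrying a
    # "type" key ("dimension", "collection", "run", "dataset_type", "dataset",
    # or "associations") that selects how it is interpreted below.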

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[Tuple[str, DatasetTypeRestriction]]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[int]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[int]]] = defaultdict(dict)
        self.refsByFileId: Dict[int, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # convert all datetime values to astropy
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "collection":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"])
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for name, restriction_data in data["children"]:
                        if restriction_data is None:
                            restriction = DatasetTypeRestriction.any
                        else:
                            restriction = DatasetTypeRestriction.fromExpression(restriction_data)
                        children.append((name, restriction))
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions,
                                isCalibration=data.get("is_calibration", False))
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                     for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                    formatter=doImport(d.get("formatter")) if "formatter" in d else None
                )
                for d in data["records"]
            )
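
    # At this point all of the export file's contents have been parsed into
    # in-memory structures (self.runs, self.chains, self.collections,
    # self.datasetTypes, self.dimensions, self.datasets, and the association
    # maps); nothing has been written to the registry yet.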

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run)
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(collection, collection_type)
        for chain, children in self.chains.items():
            self.registry.registerCollection(chain, CollectionType.CHAINED)
            self.registry.setCollectionChain(chain, children)
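
    # register() is intended to be called before load(), since load() inserts
    # datasets into the runs and dataset types created above. A minimal usage
    # sketch (assumes `stream`, `registry`, and `datastore` exist; `datastore`
    # may be None to import registry content only; the directory and transfer
    # values are illustrative):
    #
    #   backend = YamlRepoImportBackend(stream, registry)
    #   backend.register()
    #   backend.load(datastore, directory="/some/export/dir", transfer="copy")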

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            dataIds: List[DataCoordinate] = []
            dataset_ids: List[int] = []
            slices = []
            for fileDataset in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)