Coverage for python/lsst/daf/butler/transfers/_yaml.py: 16%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from ..core import (
    DatasetRef,
    DatasetType,
    DataCoordinate,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
)
from ..core.utils import iterable
from ..core.named import NamedValueSet
from ..registry import Registry
from ._interfaces import RepoExportBackend, RepoImportBackend


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })
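
    # Illustrative only: after a call like saveDimensionData(element, record),
    # one entry of self.data would look roughly like
    #     {"type": "dimension", "element": "detector", "records": [{...}]}
    # where "detector" is a hypothetical element name and each record dict is
    # produced by DimensionRecord.toDict(splitTimespan=True) above.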

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
        })
        self.data.append({
            "type": "run",
            "name": run,
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in dataset.refs],
                    "data_id": [ref.dataId.byName() for ref in dataset.refs],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": 0,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
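
# A minimal usage sketch (illustrative only; exports are normally driven by the
# higher-level Butler export machinery rather than by calling this backend
# directly, and the variable names below are hypothetical):
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream)
#         backend.saveDimensionData(element, *records)
#         backend.saveDatasets(datasetType, "my_run", *fileDatasets)
#         backend.finish()   # nothing is written to the stream until finish()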


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        # TODO: When version numbers become meaningful, check here that we can
        # read the version in the file.
        self.runs: List[str] = []
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversions use the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "run":
                self.runs.append(data["name"])
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions)
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run); the innermost list holds collections.
        self.datasets: Mapping[Tuple[str, str], List[Tuple[FileDataset, List[str]]]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                (
                    FileDataset(
                        d.get("path"),
                        [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                         for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                        formatter=doImport(d.get("formatter")) if "formatter" in d else None
                    ),
                    d.get("collections", [])
                )
                for d in data["records"]
            )
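
    # A minimal sketch of the YAML layout this constructor consumes, mirroring
    # what YamlRepoExportBackend writes above (all concrete values here are
    # hypothetical):
    #
    #     description: Butler Data Repository Export
    #     version: 0
    #     data:
    #       - type: dimension
    #         element: instrument
    #         records: [...]
    #       - type: dataset_type
    #         name: raw
    #         dimensions: [instrument, detector, exposure]
    #         storage_class: Exposure
    #       - type: run
    #         name: my_run
    #       - type: dataset
    #         dataset_type: raw
    #         run: my_run
    #         records:
    #           - dataset_id: [1]
    #             data_id: [{instrument: HSC, detector: 0, exposure: 903334}]
    #             path: raw/my_run/raw_1.fits
    #             formatter: some.package.SomeFormatter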

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for run in self.runs:
            self.registry.registerRun(run)
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # Mapping from collection name to list of DatasetRefs to associate.
        collections = defaultdict(list)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs, while remembering
            # slices that associate them with the FileDataset instances they
            # came from.
            dataIds: List[DataCoordinate] = []
            slices = []
            for fileDataset, _ in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, (fileDataset, collectionsForDataset) in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
                for collection in collectionsForDataset:
                    collections[collection].extend(fileDataset.refs)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate with collections, one collection at a time.
        for collection, refs in collections.items():
            self.registry.associate(collection, refs)
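
# A minimal usage sketch (illustrative only; imports are normally driven by the
# higher-level Butler import machinery, and names such as butler and
# export.yaml are hypothetical):
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, butler.registry)
#     backend.register()   # registers runs and dataset types; not transactional
#     backend.load(butler.datastore, directory="/exported/files", transfer="symlink")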