Coverage for python/lsst/daf/butler/transfers/_yaml.py : 11%

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import os
from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
import warnings
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImport
from ..core import (
    DatasetAssociation,
    DatasetRef,
    DatasetType,
    DataCoordinate,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core.utils import iterable
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""

class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
            "is_calibration": datasetType.isCalibration(),
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                    "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })
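
    # For illustration, a single exported dataset therefore appears in the
    # output YAML roughly as (all values below are hypothetical placeholders):
    #
    #   - type: dataset
    #     dataset_type: flat
    #     run: my_run
    #     records:
    #       - dataset_id: [42]
    #         data_id: [{instrument: SomeCam, detector: 0}]
    #         path: flat/flat_42.fits
    #         formatter: some.package.SomeFormatter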

    def saveDatasetAssociations(self, collection: str, collectionType: CollectionType,
                                associations: Iterable[DatasetAssociation]) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "dataset_ids": [assoc.ref.id for assoc in associations],
            })
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[int]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "validity_ranges": [
                    {
                        "begin": timespan.begin,
                        "end": timespan.end,
                        "dataset_ids": dataset_ids,
                    }
                    for timespan, dataset_ids in idsByTimespan.items()
                ]
            })

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
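

# A minimal usage sketch of the export backend (illustrative only): write the
# datasets of a single dataset type to a YAML stream.  The dataset type name
# "flat" and run name "my_run" are hypothetical placeholders, and ``registry``
# and ``file_datasets`` are assumed to come from an existing repository.
def _example_export(stream: IO, registry: Registry,
                    file_datasets: List[FileDataset]) -> None:
    backend = YamlRepoExportBackend(stream)
    # Look up the dataset type so its definition is written alongside the
    # dataset records themselves.
    datasetType = registry.getDatasetType("flat")
    backend.saveDatasets(datasetType, "my_run", *file_datasets)
    # finish() serializes everything accumulated so far to the stream.
    backend.finish()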


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into.  Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or older required)."
            )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[int]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[int]]] = defaultdict(dict)
        self.refsByFileId: Dict[int, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back.  Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "collection":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"])
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions,
                                isCalibration=data.get("is_calibration", False))
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.__members__[data["collection_type"].upper()]
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                     for dataId, refid in zip(iterable(d["data_id"]), iterable(d["dataset_id"]))],
                    formatter=doImport(d.get("formatter")) if "formatter" in d else None
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(collection, collection_type,
                                             doc=self.collectionDocs.get(collection))
        for chain, children in self.chains.items():
            self.registry.registerCollection(chain, CollectionType.CHAINED,
                                             doc=self.collectionDocs.get(chain))
            self.registry.setCollectionChain(chain, children)

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            datasetType = self.registry.getDatasetType(datasetTypeName)
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            dataIds: List[DataCoordinate] = []
            dataset_ids: List[int] = []
            slices = []
            for fileDataset in records:
                start = len(dataIds)
                dataIds.extend(ref.dataId for ref in fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(dataIds)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry.insertDatasets(
                datasetType,
                dataIds=dataIds,
                run=run,
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = os.path.join(directory, fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
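

# A minimal usage sketch of the import backend (illustrative only): load an
# export file into an existing registry and datastore.  The file name, the
# directory of exported files, and the "symlink" transfer mode are hypothetical
# placeholders; ``registry`` and ``datastore`` are assumed to come from an
# existing Butler repository.
def _example_import(registry: Registry, datastore: Datastore) -> None:
    with open("export.yaml", "r") as stream:
        # The constructor reads and validates the whole file up front, so the
        # stream can be closed before register() and load() are called.
        backend = YamlRepoImportBackend(stream, registry)
    backend.register()
    backend.load(datastore, directory="/path/to/exported/files", transfer="symlink")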