Coverage for python/lsst/daf/butler/transfers/_yaml.py: 13%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from datetime import datetime
from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""

def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with the tag "!uuid" whose value is the
    regular string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
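
# Illustrative note added for clarity (not part of the original module): with
# the representer and constructor registered above, UUID values round-trip
# through PyYAML as "!uuid" scalars. A minimal sketch:
#
#     >>> value = uuid.uuid4()
#     >>> text = yaml.dump({"id": value})       # yaml.Dumper emits "id: !uuid '...'"
#     >>> yaml.safe_load(text)["id"] == value   # yaml.SafeLoader rebuilds the UUID
#     True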

class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass.name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
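
# Illustrative note added for clarity (not part of the original module): the
# document written by finish() has the top-level keys "description", "version",
# and "data", where "data" is the list built up by the save* methods above.
# A sketch of the resulting YAML, with entry contents elided and exact quoting
# left to PyYAML:
#
#     description: Butler Data Repository Export
#     version: "1.0.1"
#     data:
#       - type: dimension
#         element: ...
#         records: [...]
#       - type: collection
#         collection_type: RUN
#         name: ...
#       - type: dataset_type
#         name: ...
#       - type: dataset
#         dataset_type: ...
#         run: ...
#         records: [...]
#       - type: associations
#         collection: ...
#         collection_type: TAGGED
#         dataset_ids: [...]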

class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run it at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or earlier required)."
            )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversions use the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])
            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=data["dimensions"],
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility; it
                            # should be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Optional[Datastore],
        *,
        directory: Optional[str] = None,
        transfer: Optional[str] = None,
        skip_dimensions: Optional[Set] = None,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(
                datasets, idGenerationMode=idGenerationMode, reuseIds=reuseIds
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
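
# Illustrative note added for clarity (not part of the original module): a
# minimal sketch of how these backends are typically driven. The names
# `out_stream`, `in_stream`, `element`, `records`, `run_record`,
# `dataset_type`, `file_datasets`, `registry`, and `datastore` are
# placeholders, not objects defined in this module:
#
#     >>> exporter = YamlRepoExportBackend(out_stream)
#     >>> exporter.saveDimensionData(element, *records)
#     >>> exporter.saveCollection(run_record, doc=None)
#     >>> exporter.saveDatasets(dataset_type, "some_run", *file_datasets)
#     >>> exporter.finish()
#
#     >>> importer = YamlRepoImportBackend(in_stream, registry)
#     >>> importer.register()                        # create collections and dataset types
#     >>> importer.load(datastore, transfer="copy")  # insert datasets and ingest files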