Coverage for python/lsst/daf/butler/transfers/_yaml.py: 12%
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

from datetime import datetime
from typing import (
    Any,
    Dict,
    IO,
    Iterable,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
)
import uuid
import warnings
from collections import defaultdict

import yaml
import astropy.time

from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable
from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    FileDataset,
    Timespan,
)
from ..core._butlerUri import ButlerURI
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend


EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
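
# Illustrative sketch (not from the original source): with the handlers
# registered above, a UUID round-trips through YAML as a tagged scalar,
# roughly
#
#     value = uuid.uuid4()
#     text = yaml.dump(value)       # e.g. "!uuid 1b9d6bcd-...\n"
#     assert yaml.safe_load(text) == value
#
# The constructor is registered on SafeLoader so the import backend can keep
# using yaml.safe_load() on export files.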


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    """

    def __init__(self, stream: IO):
        self.stream = stream
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append({
            "type": "dimension",
            "element": element.name,
            "records": data_dicts,
        })

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append({
            "type": "dataset_type",
            "name": datasetType.name,
            "dimensions": [d.name for d in datasetType.dimensions],
            "storage_class": datasetType.storageClass.name,
            "is_calibration": datasetType.isCalibration(),
        })
        self.data.append({
            "type": "dataset",
            "dataset_type": datasetType.name,
            "run": run,
            "records": [
                {
                    "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                    "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                    "path": dataset.path,
                    "formatter": dataset.formatter,
                    # TODO: look up and save other collections
                }
                for dataset in datasets
            ]
        })
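
    # Shape of the exported entries (illustrative sketch; the dataset type,
    # dimensions, and paths below are made-up examples): each call adds two
    # entries to the output "data" list, roughly
    #
    #     - type: dataset_type
    #       name: flat
    #       dimensions: [instrument, detector, physical_filter]
    #       storage_class: ExposureF
    #       is_calibration: true
    #     - type: dataset
    #       dataset_type: flat
    #       run: some_run
    #       records:
    #         - dataset_id: [!uuid '...']
    #           data_id: [{instrument: SomeCam, detector: 1, physical_filter: g}]
    #           path: flat/flat_1.fits
    #           formatter: some.package.SomeFormatter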

    def saveDatasetAssociations(self, collection: str, collectionType: CollectionType,
                                associations: Iterable[DatasetAssociation]) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "dataset_ids": [assoc.ref.id for assoc in associations],
            })
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append({
                "type": "associations",
                "collection": collection,
                "collection_type": collectionType.name,
                "validity_ranges": [
                    {
                        "timespan": timespan,
                        "dataset_ids": dataset_ids,
                    }
                    for timespan, dataset_ids in idsByTimespan.items()
                ]
            })
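
    # Corresponding export entries (illustrative sketch; collection names are
    # placeholders): a TAGGED collection is written as a flat list of dataset
    # ids,
    #
    #     - type: associations
    #       collection: my_tagged
    #       collection_type: TAGGED
    #       dataset_ids: [!uuid '...', !uuid '...']
    #
    # while a CALIBRATION collection groups ids by validity range,
    #
    #     - type: associations
    #       collection: my_calibs
    #       collection_type: CALIBRATION
    #       validity_ranges:
    #         - timespan: ...
    #           dataset_ids: [!uuid '...']
    #
    # Associations for other collection types are silently skipped here.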

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
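

# Usage sketch (hypothetical driver code, not part of this module): the export
# backend only buffers dictionaries until finish() writes the YAML document,
# so a caller is expected to do roughly
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream)
#         backend.saveDimensionData(element, *records)
#         backend.saveCollection(run_record, doc=None)
#         backend.saveDatasets(dataset_type, "some_run", *file_datasets)
#         backend.finish()
#
# where element, records, run_record, dataset_type, and file_datasets are
# assumed to come from an existing Registry; in practice this is driven by the
# higher-level repository export machinery rather than called directly.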


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or older required)."
                )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(
                    RecordClass(**r) for r in data["records"]
                )
            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"])
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                self.datasetTypes.add(
                    DatasetType(data["name"], dimensions=data["dimensions"],
                                storageClass=data["storage_class"], universe=self.registry.dimensions,
                                isCalibration=data.get("is_calibration", False))
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility; it
                            # should be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                     for dataId, refid in zip(ensure_iterable(d["data_id"]),
                                              ensure_iterable(d["dataset_id"]))],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(collection, collection_type,
                                             doc=self.collectionDocs.get(collection))
        for chain, children in self.chains.items():
            self.registry.registerCollection(chain, CollectionType.CHAINED,
                                             doc=self.collectionDocs.get(chain))
            self.registry.setCollectionChain(chain, children)
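
    # Note: register() only creates dataset types, runs, and other
    # collections; dimension records, datasets, and associations are written
    # by load() below, which expects register() to have been called first so
    # that the target runs and collections exist.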

    def load(self, datastore: Optional[Datastore], *,
             directory: Optional[str] = None, transfer: Optional[str] = None,
             skip_dimensions: Optional[Set] = None,
             idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
             reuseIds: bool = False) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            self.registry.insertDimensionData(element, *dimensionRecords)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets, idGenerationMode=idGenerationMode,
                                                         reuseIds=reuseIds)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ButlerURI(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
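

# Usage sketch (hypothetical, for illustration): a typical import reads a file
# produced by YamlRepoExportBackend, registers the static entities, and then
# loads the bulk data, roughly
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#         backend.register()
#         backend.load(datastore, directory="/some/export/dir", transfer="copy")
#
# where registry and datastore refer to the target data repository, and
# "copy" is just one possible datastore transfer mode.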