Coverage for python/lsst/daf/butler/transfers/_yaml.py: 11%
185 statements
coverage.py v6.5.0, created at 2023-02-01 02:05 -0800
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from datetime import datetime
from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend


EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with the tag "!uuid" whose value is the
    regular string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
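
# A minimal round-trip sketch of the handlers registered above (illustrative
# only, not part of the original module):
#
#     value = uuid.uuid4()
#     text = yaml.dump(value)            # emitted as a scalar tagged "!uuid"
#     assert yaml.safe_load(text) == value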


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe of the repository being exported.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
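
# Illustrative usage sketch (not part of the original module): an export is
# typically driven by higher-level daf_butler code (e.g. `Butler.export`); a
# simplified direct invocation, with hypothetical local names, might look like:
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream, registry.dimensions)
#         backend.saveDimensionData(element, *records)
#         backend.saveCollection(collection_record, None)
#         backend.saveDatasets(dataset_type, run, *file_datasets)
#         backend.finish()  # nothing is written until finish() dumps the YAML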


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or older required)."
                )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this data was exported before the reorganization of visits and
        # visit systems and the new schema is in use, some filtering will be
        # needed. The visit_system entry in each visit dimension record will
        # be dropped silently when the visit record is created, but the
        # corresponding visit_system_membership records must be constructed
        # explicitly.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy times.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )
296 elif data["type"] == "collection":
297 collectionType = CollectionType.from_name(data["collection_type"])
298 if collectionType is CollectionType.RUN:
299 self.runs[data["name"]] = (
300 data["host"],
301 Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
302 )
303 elif collectionType is CollectionType.CHAINED:
304 children = []
305 for child in data["children"]:
306 if not isinstance(child, str):
307 warnings.warn(
308 f"CHAINED collection {data['name']} includes restrictions on child "
309 "collection searches, which are no longer suppored and will be ignored."
310 )
311 # Old form with dataset type restrictions only,
312 # supported for backwards compatibility.
313 child, _ = child
314 children.append(child)
315 self.chains[data["name"]] = children
316 else:
317 self.collections[data["name"]] = collectionType
318 doc = data.get("doc")
319 if doc is not None:
320 self.collectionDocs[data["name"]] = doc
321 elif data["type"] == "run":
322 # Also support old form of saving a run with no extra info.
323 self.runs[data["name"]] = (None, Timespan(None, None))
324 elif data["type"] == "dataset_type":
325 dimensions = data["dimensions"]
326 if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
327 dimensions.remove("visit_system")
328 self.datasetTypes.add(
329 DatasetType(
330 data["name"],
331 dimensions=dimensions,
332 storageClass=data["storage_class"],
333 universe=self.registry.dimensions,
334 isCalibration=data.get("is_calibration", False),
335 )
336 )
337 elif data["type"] == "dataset":
338 # Save raw dataset data for a second loop, so we can ensure we
339 # know about all dataset types first.
340 datasetData.append(data)
341 elif data["type"] == "associations":
342 collectionType = CollectionType.from_name(data["collection_type"])
343 if collectionType is CollectionType.TAGGED:
344 self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
345 elif collectionType is CollectionType.CALIBRATION:
346 assocsByTimespan = self.calibAssociations[data["collection"]]
347 for d in data["validity_ranges"]:
348 if "timespan" in d:
349 assocsByTimespan[d["timespan"]] = d["dataset_ids"]
350 else:
351 # TODO: this is for backward compatibility, should
352 # be removed at some point.
353 assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
354 else:
355 raise ValueError(f"Unexpected calibration type for association: {collectionType.name}.")
356 else:
357 raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # Key is (dataset type name, run).
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Optional[Datastore],
        *,
        directory: Optional[str] = None,
        transfer: Optional[str] = None,
        skip_dimensions: Optional[Set] = None,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(
                datasets, idGenerationMode=idGenerationMode, reuseIds=reuseIds
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
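
# Illustrative usage sketch (not part of the original module): a typical
# import constructs the backend, registers collections and dataset types, and
# then loads everything; names below are hypothetical locals. The stream is
# read fully in the constructor, so it may be closed before register()/load().
#
#     with open("export.yaml", "r") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()  # dataset types, runs, collections, chains
#     backend.load(datastore, directory="/some/export/dir", transfer="copy")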