Coverage for python/lsst/daf/butler/transfers/_yaml.py: 11%
185 statements
coverage.py v6.5.0, created at 2023-04-14 09:22 +0000
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Type

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import (
    ChainedCollectionRecord,
    CollectionRecord,
    DatasetIdGenEnum,
    RunRecord,
    VersionTuple,
)
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""
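
# Compatibility sketch (illustrative; assumes `VersionTuple` uses the dotted
# major.minor.patch string form that `finish` and `fromString` below rely on):
# a file written as "1.0.0" or "1.0.2" can be read, while "1.1.0" (newer minor)
# or "2.0.0" (different major) is rejected by `YamlRepoImportBackend`.
#
#     >>> str(EXPORT_FORMAT_VERSION)
#     '1.0.2'
#     >>> VersionTuple.fromString("1.1.0").minor > EXPORT_FORMAT_VERSION.minor
#     True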


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a "!uuid" tag whose value is the regular
    string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> Optional[uuid.UUID]:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
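
# Example (an illustrative sketch, not part of the module API): with the
# handlers registered above, a UUID round-trips through YAML as a "!uuid"
# tagged scalar.
#
#     >>> import uuid, yaml
#     >>> original = uuid.uuid4()
#     >>> text = yaml.dump({"id": original})  # uses yaml.Dumper by default
#     >>> yaml.safe_load(text)["id"] == original
#     True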


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe of the repository being exported; its version
        and namespace are recorded in the output file.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: List[Dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: Dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )
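
    # For illustration (a sketch of the entries appended above; all field
    # values here are hypothetical): each call records one "dataset_type"
    # entry followed by one "dataset" entry, e.g.
    #
    #     {"type": "dataset_type", "name": "flat",
    #      "dimensions": ["instrument", "detector", "physical_filter"],
    #      "storage_class": "ExposureF", "is_calibration": True}
    #     {"type": "dataset", "dataset_type": "flat", "run": "calib/run",
    #      "records": [{"dataset_id": [...], "data_id": [...],
    #                   "path": "flat_1.fits", "formatter": None}]}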

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: Dict[Timespan, List[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                assert association.ref.id is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
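
# Usage sketch (illustrative; ``registry`` is assumed to be an existing
# `Registry` instance and is not defined here): export the "instrument"
# dimension records to an in-memory YAML document.
#
#     >>> import io
#     >>> stream = io.StringIO()
#     >>> backend = YamlRepoExportBackend(stream, registry.dimensions)
#     >>> backend.saveDimensionData(
#     ...     registry.dimensions["instrument"],
#     ...     *registry.queryDimensionRecords("instrument"),
#     ... )
#     >>> backend.finish()
#     >>> stream.getvalue().startswith("description: Butler Data Repository Export")
#     True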


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or older required)."
            )
        self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
        self.chains: Dict[str, List[str]] = {}
        self.collections: Dict[str, CollectionType] = {}
        self.collectionDocs: Dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: Dict[str, List[DatasetId]] = defaultdict(list)
        self.calibAssociations: Dict[str, Dict[Timespan, List[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: Dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this data was exported before the reorganization of visits and
        # visit systems, and the new schema is in use in the registry, some
        # filtering will be needed. The visit_system entry in each visit
        # dimension record will be silently dropped when the visit record is
        # created, but the visit_system_membership records must be constructed
        # explicitly.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: Type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility; it
                            # should be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[Tuple[str, str], List[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Optional[Datastore],
        *,
        directory: ResourcePathExpression | None = None,
        transfer: Optional[str] = None,
        skip_dimensions: Optional[Set] = None,
        idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE,
        reuseIds: bool = False,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the
            # ones being imported. It'd be ideal to check that, but that
            # would mean using syncDimensionData, which is not vectorized and
            # is hence unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: List[DatasetRef] = []
            dataset_ids: List[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)  # type: ignore
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(
                datasets, idGenerationMode=idGenerationMode, reuseIds=reuseIds
            )
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
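
# Usage sketch (illustrative; ``registry``, ``datastore``, and the file path
# are hypothetical): import a previously exported dump by reading the YAML,
# registering collections and dataset types, then loading records and files.
#
#     >>> with open("export.yaml") as stream:
#     ...     backend = YamlRepoImportBackend(stream, registry)
#     >>> backend.register()
#     >>> backend.load(datastore, directory="/path/to/exported/files", transfer="copy")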