Coverage for python/lsst/daf/butler/transfers/_yaml.py: 12% (185 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import defaultdict
from collections.abc import Iterable, Mapping
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.iteration import ensure_iterable
from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend
if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression
EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""
def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
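# Illustrative round-trip (a sketch, not executed as part of this module):
# dumping a UUID with the default yaml.Dumper now emits a "!uuid"-tagged
# scalar, and yaml.safe_load turns it back into a uuid.UUID via the
# constructor registered above, e.g.
#
#     uid = uuid.uuid4()
#     text = yaml.dump({"dataset_id": uid})  # dataset_id: !uuid '<hex string>'
#     assert yaml.safe_load(text)["dataset_id"] == uid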
class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream : `IO`
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe of the repository being exported.
    """
    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: list[dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )
    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )
    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
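# A minimal usage sketch for the export side (illustrative only; in practice
# this backend is driven by the higher-level export machinery rather than
# called directly, and `registry`, `run_record`, `dataset_type`, and
# `file_datasets` below are assumed placeholders):
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream, registry.dimensions)
#         backend.saveCollection(run_record, doc=None)
#         backend.saveDatasets(dataset_type, "my_run", *file_datasets)
#         backend.finish()  # nothing is written to the stream until finish()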
class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream : `IO`
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into.  Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """
    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather
        # not run that at all if there's going to be a problem later in
        # `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or earlier required)."
                )
        self.runs: dict[str, tuple[str | None, Timespan]] = {}
        self.chains: dict[str, list[str]] = {}
        self.collections: dict[str, CollectionType] = {}
        self.collectionDocs: dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
        self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")
        # If this is data exported before the reorganization of visits and
        # visit systems, and the new schema is in use, some filtering will be
        # needed.  The visit_system entry in the visit dimension record will
        # be silently dropped when the visit record is created, but the
        # visit_system_membership records must be constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True
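        # For example (illustrative values only): an exported visit record
        # such as {"instrument": "Cam", "id": 42, "visit_system": 1, ...}
        # from a pre-version-2 universe still carries a visit_system key; the
        # visit RecordClass below drops that key, and the
        # migrate_visit_system branch synthesizes the companion record
        # visit_system_membership(instrument="Cam", visit_system=1, visit=42).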
        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native YAML
                        # support for datetime; we support reading that data
                        # back.  Newer conversion uses the _AstropyTimeToYAML
                        # class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )
            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored."
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(data["dataset_ids"])
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = d["dataset_ids"]
                        else:
                            # TODO: this is for backward compatibility; it
                            # should be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = d["dataset_ids"]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(datasetType, dataId, run=data["run"], id=refid)
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"])
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )
    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)
    def load(
        self,
        datastore: Datastore | None,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
        skip_dimensions: set | None = None,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the
            # ones being imported.  It'd be ideal to check that, but that
            # would mean using syncDimensionData, which is not vectorized and
            # is hence unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for (datasetTypeName, run), records in self.datasets.items():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: list[DatasetRef] = []
            dataset_ids: list[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)
                stop = len(datasets)
                slices.append(slice(start, stop))
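            # For example (illustrative counts): two FileDatasets holding two
            # and three refs respectively yield a five-element `datasets`
            # list and slices == [slice(0, 2), slice(2, 5)], so each
            # FileDataset can recover its own resolved refs below.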
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs
            for fileId, ref in zip(dataset_ids, resolvedRefs):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
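# A minimal import sketch (illustrative only; `registry`, `datastore`, the
# directory path, and the "copy" transfer mode are assumed placeholders):
#
#     with open("export.yaml") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()  # create dataset types, runs, and other collections
#     backend.load(datastore, directory="/some/dir", transfer="copy")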