Coverage for python/lsst/daf/butler/transfers/_yaml.py: 13% (193 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import UserDict, defaultdict
from collections.abc import Iterable, Mapping
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel
from lsst.utils.iteration import ensure_iterable

from ..core import (
    DatasetAssociation,
    DatasetId,
    DatasetRef,
    DatasetType,
    Datastore,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Timespan,
)
from ..core.named import NamedValueSet
from ..registry import CollectionType, Registry
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read
by this version of the code.
"""


class _RefMapper(UserDict[int, uuid.UUID]):
    """Local `dict` subclass which creates a new deterministic UUID for each
    missing key.
    """

    _namespace = uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b")

    def __missing__(self, key: int) -> uuid.UUID:
        newUUID = uuid.uuid3(namespace=self._namespace, name=str(key))
        self[key] = newUUID
        return newUUID


_refIntId2UUID = _RefMapper()
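
# Example (illustrative only): because uuid3 hashing with a fixed namespace is
# deterministic, looking up the same legacy integer dataset ID always yields
# the same UUID:
#
#     _refIntId2UUID[42] == _refIntId2UUID[42]   # True, and stable across runs
#     _refIntId2UUID[42] == _refIntId2UUID[43]   # False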


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with the tag "!uuid" whose value is the
    regular string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
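
# Example (illustrative only): with the representer and constructor registered
# above, UUIDs round-trip through YAML as tagged scalars, roughly:
#
#     yaml.dump(uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b"))
#     # -> "!uuid '4d4851f4-2890-4d41-8779-5f38a3f5062b'\n"
#     yaml.safe_load("!uuid '4d4851f4-2890-4d41-8779-5f38a3f5062b'")
#     # -> UUID('4d4851f4-2890-4d41-8779-5f38a3f5062b')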


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe of the repository being exported; its version
        and namespace are recorded in the output file.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: list[dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
        # Docstring inherited from RepoExportBackend.saveCollection.
        data: dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": [d.name for d in datasetType.dimensions],
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )
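
    # A single saveDatasets call appends two entries to self.data: one
    # describing the dataset type and one listing the file records. Roughly
    # (schematic, illustrative values only):
    #
    #     {"type": "dataset_type", "name": "<name>", "dimensions": [...],
    #      "storage_class": "<storage class>", "is_calibration": False}
    #     {"type": "dataset", "dataset_type": "<name>", "run": "<run>",
    #      "records": [{"dataset_id": [...], "data_id": [...],
    #                   "path": "<path>", "formatter": "<formatter>"}]}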

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
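
# The file written by YamlRepoExportBackend.finish is a single YAML document
# whose top-level keys mirror the dict passed to yaml.dump above, e.g.
# (illustrative values only):
#
#     description: Butler Data Repository Export
#     version: 1.0.2
#     universe_version: 5
#     universe_namespace: daf_butler
#     data:
#       - type: dimension
#         element: instrument
#         records: [...]
#       - type: collection
#         ...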


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream
        A readable file-like object.
    registry : `Registry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: Registry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or earlier required)."
            )
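
        # For example, with EXPORT_FORMAT_VERSION = 1.0.2 the checks above
        # accept files written as 1.0.0, 1.0.1 or 1.0.2 and reject 1.1.0
        # (newer minor version) or 2.0.0 (different major version).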
        self.runs: dict[str, tuple[str | None, Timespan]] = {}
        self.chains: dict[str, list[str]] = {}
        self.collections: dict[str, CollectionType] = {}
        self.collectionDocs: dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
        self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: dict[DatasetId, DatasetRef] = {}
        self.registry: Registry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this is data exported before the reorganization of visits
        # and visit systems, and the new schema is in use, some filtering
        # will be needed. The visit_system entry in each visit dimension
        # record will be silently dropped when the visit record is created,
        # but the corresponding visit_system_membership records must be
        # constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        datasetData = []
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses the
                        # _AstropyTimeToYAML class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")
                element = self.registry.dimensions[data["element"]]
                RecordClass: type[DimensionRecord] = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

                if data["element"] == "visit" and migrate_visit_system:
                    # Must create the visit_system_membership records.
                    element = self.registry.dimensions["visit_system_membership"]
                    RecordClass = element.RecordClass
                    self.dimensions[element].extend(
                        RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
                        for r in data["records"]
                    )

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored.",
                                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                    doc = data.get("doc")
                    if doc is not None:
                        self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support the old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(
                        [x if not isinstance(x, int) else _refIntId2UUID[x] for x in data["dataset_ids"]]
                    )
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(
                            datasetType,
                            dataId,
                            run=data["run"],
                            id=refid if not isinstance(refid, int) else _refIntId2UUID[refid],
                        )
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"]), strict=True
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )
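
        # Note: integer dataset IDs from very old export files are mapped to
        # deterministic UUIDs via _refIntId2UUID, both here and in the
        # association handling above, so re-importing the same file yields
        # the same dataset IDs each time.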

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Datastore | None,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
        skip_dimensions: set | None = None,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for records in self.datasets.values():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: list[DatasetRef] = []
            dataset_ids: list[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs.
            for fileId, ref in zip(dataset_ids, resolvedRefs, strict=True):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records, strict=True):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
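
# Minimal usage sketch (illustrative only; ``universe``, ``registry`` and
# ``datastore`` stand for an existing DimensionUniverse, Registry and
# Datastore and are not defined in this module):
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream, universe)
#         ...  # saveDimensionData / saveCollection / saveDatasets calls
#         backend.finish()
#
#     with open("export.yaml") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#         backend.register()
#         backend.load(datastore, transfer="copy")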