Coverage for python/lsst/daf/butler/transfers/_yaml.py: 13%
193 statements
coverage.py v7.3.1, created at 2023-10-02 08:00 +0000
1# This file is part of daf_butler.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (http://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This software is dual licensed under the GNU General Public License and also
10# under a 3-clause BSD license. Recipients may choose which of these licenses
11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12# respectively. If you choose the GPL option then the following text applies
13# (but note that there is still no warranty even if you opt for BSD instead):
14#
15# This program is free software: you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation, either version 3 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program. If not, see <http://www.gnu.org/licenses/>.
28from __future__ import annotations
30__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]
32import uuid
33import warnings
34from collections import UserDict, defaultdict
35from collections.abc import Iterable, Mapping
36from datetime import datetime
37from typing import IO, TYPE_CHECKING, Any
39import astropy.time
40import yaml
41from lsst.resources import ResourcePath
42from lsst.utils import doImportType
43from lsst.utils.introspection import find_outside_stacklevel
44from lsst.utils.iteration import ensure_iterable
46from ..core import (
47 DatasetAssociation,
48 DatasetId,
49 DatasetRef,
50 DatasetType,
51 Datastore,
52 DimensionElement,
53 DimensionRecord,
54 DimensionUniverse,
55 FileDataset,
56 Timespan,
57)
58from ..core.named import NamedValueSet
59from ..registry import CollectionType, Registry
60from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
61from ..registry.versions import IncompatibleVersionError
62from ._interfaces import RepoExportBackend, RepoImportBackend
64if TYPE_CHECKING:
65 from lsst.resources import ResourcePathExpression
67EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
68"""Export format version.
70Files with a different major version or a newer minor version cannot be read by
71this version of the code.
72"""
75class _RefMapper(UserDict[int, uuid.UUID]):
76 """Create a local dict subclass which creates new deterministic UUID for
77 missing keys.
78 """
80 _namespace = uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b")
82 def __missing__(self, key: int) -> uuid.UUID:
83 newUUID = uuid.uuid3(namespace=self._namespace, name=str(key))
84 self[key] = newUUID
85 return newUUID
88_refIntId2UUID = _RefMapper()
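Because uuid3 hashes each key against a fixed namespace, the mapping from legacy integer dataset IDs to UUIDs is deterministic across runs and processes. A small sketch of the behaviour (the integer 42 is arbitrary):

    legacy_id = 42
    first = _refIntId2UUID[legacy_id]   # __missing__ generates a UUID via uuid.uuid3
    second = _refIntId2UUID[legacy_id]  # now cached; the same value is returned
    assert first == second
    assert first == uuid.uuid3(_RefMapper._namespace, str(legacy_id))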
91def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
92 """Generate YAML representation for UUID.
94 This produces a scalar node with a tag "!uuid" and value being a regular
95 string representation of UUID.
96 """
97 return dumper.represent_scalar("!uuid", str(data))
100def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
101 if node.value is not None:
102 return uuid.UUID(hex=node.value)
103 return None
106yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
107yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
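With the representer and constructor registered above, UUID values round-trip through the default Dumper and the SafeLoader as "!uuid" scalar nodes. For example:

    original = uuid.uuid4()
    text = yaml.dump({"dataset_id": original})   # emits: dataset_id: !uuid '...'
    restored = yaml.safe_load(text)["dataset_id"]
    assert restored == original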
110class YamlRepoExportBackend(RepoExportBackend):
111 """A repository export implementation that saves to a YAML file.
113 Parameters
114 ----------
115 stream
116 A writeable file-like object.
117 """
119 def __init__(self, stream: IO, universe: DimensionUniverse):
120 self.stream = stream
121 self.universe = universe
122 self.data: list[dict[str, Any]] = []
124 def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
125 # Docstring inherited from RepoExportBackend.saveDimensionData.
126 data_dicts = [record.toDict(splitTimespan=True) for record in data]
127 self.data.append(
128 {
129 "type": "dimension",
130 "element": element.name,
131 "records": data_dicts,
132 }
133 )
135 def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
136 # Docstring inherited from RepoExportBackend.saveCollections.
137 data: dict[str, Any] = {
138 "type": "collection",
139 "collection_type": record.type.name,
140 "name": record.name,
141 }
142 if doc is not None:
143 data["doc"] = doc
144 if isinstance(record, RunRecord):
145 data["host"] = record.host
146 data["timespan_begin"] = record.timespan.begin
147 data["timespan_end"] = record.timespan.end
148 elif isinstance(record, ChainedCollectionRecord):
149 data["children"] = list(record.children)
150 self.data.append(data)
152 def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
153 # Docstring inherited from RepoExportBackend.saveDatasets.
154 self.data.append(
155 {
156 "type": "dataset_type",
157 "name": datasetType.name,
158 "dimensions": [d.name for d in datasetType.dimensions],
159 "storage_class": datasetType.storageClass_name,
160 "is_calibration": datasetType.isCalibration(),
161 }
162 )
163 self.data.append(
164 {
165 "type": "dataset",
166 "dataset_type": datasetType.name,
167 "run": run,
168 "records": [
169 {
170 "dataset_id": [ref.id for ref in sorted(dataset.refs)],
171 "data_id": [ref.dataId.byName() for ref in sorted(dataset.refs)],
172 "path": dataset.path,
173 "formatter": dataset.formatter,
174 # TODO: look up and save other collections
175 }
176 for dataset in datasets
177 ],
178 }
179 )
181 def saveDatasetAssociations(
182 self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
183 ) -> None:
184 # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
185 if collectionType is CollectionType.TAGGED:
186 self.data.append(
187 {
188 "type": "associations",
189 "collection": collection,
190 "collection_type": collectionType.name,
191 "dataset_ids": [assoc.ref.id for assoc in associations],
192 }
193 )
194 elif collectionType is CollectionType.CALIBRATION:
195 idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
196 for association in associations:
197 assert association.timespan is not None
198 idsByTimespan[association.timespan].append(association.ref.id)
199 self.data.append(
200 {
201 "type": "associations",
202 "collection": collection,
203 "collection_type": collectionType.name,
204 "validity_ranges": [
205 {
206 "timespan": timespan,
207 "dataset_ids": dataset_ids,
208 }
209 for timespan, dataset_ids in idsByTimespan.items()
210 ],
211 }
212 )
214 def finish(self) -> None:
215 # Docstring inherited from RepoExportBackend.
216 yaml.dump(
217 {
218 "description": "Butler Data Repository Export",
219 "version": str(EXPORT_FORMAT_VERSION),
220 "universe_version": self.universe.version,
221 "universe_namespace": self.universe.namespace,
222 "data": self.data,
223 },
224 stream=self.stream,
225 sort_keys=False,
226 )
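Exports are normally driven through Butler.export(), but the backend can also be exercised directly: entries accumulate in self.data and nothing is written to the stream until finish(). A hypothetical sketch (the repository path and the "instrument" query are assumptions for illustration):

    import io

    from lsst.daf.butler import Butler

    butler = Butler("/path/to/repo")  # hypothetical repository
    stream = io.StringIO()
    backend = YamlRepoExportBackend(stream, butler.dimensions)
    records = butler.registry.queryDimensionRecords("instrument")
    backend.saveDimensionData(butler.dimensions["instrument"], *records)
    backend.finish()  # the YAML document is written here
    print(stream.getvalue())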
229class YamlRepoImportBackend(RepoImportBackend):
230 """A repository import implementation that reads from a YAML file.
232 Parameters
233 ----------
234 stream
235 A readable file-like object.
236 registry : `Registry`
237 The registry that datasets will be imported into. Only used to retrieve
238 dataset types during construction; all writes happen in `register`
239 and `load`.
240 """
242 def __init__(self, stream: IO, registry: Registry):
243 # We read the file fully and convert its contents to Python objects
244 # instead of loading incrementally so we can spot some problems early;
245 # because `register` can't be put inside a transaction, we'd rather not
246 # run that at all if there's going to be a problem later in `load`.
247 wrapper = yaml.safe_load(stream)
248 if wrapper["version"] == 0:
249 # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
250 # before we really tried to do versioning here.
251 fileVersion = VersionTuple(1, 0, 0)
252 else:
253 fileVersion = VersionTuple.fromString(wrapper["version"])
254 if fileVersion.major != EXPORT_FORMAT_VERSION.major:
255 raise IncompatibleVersionError(
256 f"Cannot read repository export file with version={fileVersion} "
257 f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
258 )
259 if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
260 raise IncompatibleVersionError(
261 f"Cannot read repository export file with version={fileVersion} "
262 f"< {EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x required."
263 )
264 self.runs: dict[str, tuple[str | None, Timespan]] = {}
265 self.chains: dict[str, list[str]] = {}
266 self.collections: dict[str, CollectionType] = {}
267 self.collectionDocs: dict[str, str] = {}
268 self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
269 self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
270 self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
271 self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
272 self.refsByFileId: dict[DatasetId, DatasetRef] = {}
273 self.registry: Registry = registry
275 universe_version = wrapper.get("universe_version", 0)
276 universe_namespace = wrapper.get("universe_namespace", "daf_butler")
278 # If this data was exported before the reorganization of visits and
279 # visit systems, and the new schema is in use, some filtering is
280 # needed: the visit_system entry in each visit dimension record is
281 # silently dropped when the visit record is created, but the
282 # visit_system_membership records must be constructed explicitly.
283 migrate_visit_system = False
284 if (
285 universe_version < 2
286 and universe_namespace == "daf_butler"
287 and "visit_system_membership" in self.registry.dimensions
288 ):
289 migrate_visit_system = True
291 datasetData = []
292 for data in wrapper["data"]:
293 if data["type"] == "dimension":
294 # convert all datetime values to astropy
295 for record in data["records"]:
296 for key in record:
297 # Some older YAML files were produced with native
298 # YAML support for datetime; we support reading that
299 # data back. Newer files use the _AstropyTimeToYAML
300 # class with a special YAML tag.
301 if isinstance(record[key], datetime):
302 record[key] = astropy.time.Time(record[key], scale="utc")
303 element = self.registry.dimensions[data["element"]]
304 RecordClass: type[DimensionRecord] = element.RecordClass
305 self.dimensions[element].extend(RecordClass(**r) for r in data["records"])
307 if data["element"] == "visit" and migrate_visit_system:
308 # Must create the visit_system_membership records.
309 element = self.registry.dimensions["visit_system_membership"]
310 RecordClass = element.RecordClass
311 self.dimensions[element].extend(
312 RecordClass(instrument=r["instrument"], visit_system=r["visit_system"], visit=r["id"])
313 for r in data["records"]
314 )
316 elif data["type"] == "collection":
317 collectionType = CollectionType.from_name(data["collection_type"])
318 if collectionType is CollectionType.RUN:
319 self.runs[data["name"]] = (
320 data["host"],
321 Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
322 )
323 elif collectionType is CollectionType.CHAINED:
324 children = []
325 for child in data["children"]:
326 if not isinstance(child, str):
327 warnings.warn(
328 f"CHAINED collection {data['name']} includes restrictions on child "
329 "collection searches, which are no longer suppored and will be ignored.",
330 stacklevel=find_outside_stacklevel("lsst.daf.butler"),
331 )
332 # Old form with dataset type restrictions only,
333 # supported for backwards compatibility.
334 child, _ = child
335 children.append(child)
336 self.chains[data["name"]] = children
337 else:
338 self.collections[data["name"]] = collectionType
339 doc = data.get("doc")
340 if doc is not None:
341 self.collectionDocs[data["name"]] = doc
342 elif data["type"] == "run":
343 # Also support old form of saving a run with no extra info.
344 self.runs[data["name"]] = (None, Timespan(None, None))
345 elif data["type"] == "dataset_type":
346 dimensions = data["dimensions"]
347 if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
348 dimensions.remove("visit_system")
349 self.datasetTypes.add(
350 DatasetType(
351 data["name"],
352 dimensions=dimensions,
353 storageClass=data["storage_class"],
354 universe=self.registry.dimensions,
355 isCalibration=data.get("is_calibration", False),
356 )
357 )
358 elif data["type"] == "dataset":
359 # Save raw dataset data for a second loop, so we can ensure we
360 # know about all dataset types first.
361 datasetData.append(data)
362 elif data["type"] == "associations":
363 collectionType = CollectionType.from_name(data["collection_type"])
364 if collectionType is CollectionType.TAGGED:
365 self.tagAssociations[data["collection"]].extend(
366 [x if not isinstance(x, int) else _refIntId2UUID[x] for x in data["dataset_ids"]]
367 )
368 elif collectionType is CollectionType.CALIBRATION:
369 assocsByTimespan = self.calibAssociations[data["collection"]]
370 for d in data["validity_ranges"]:
371 if "timespan" in d:
372 assocsByTimespan[d["timespan"]] = [
373 x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
374 ]
375 else:
376 # TODO: this is for backward compatibility, should
377 # be removed at some point.
378 assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
379 x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
380 ]
381 else:
382 raise ValueError(f"Unexpected calibration type for association: {collectionType.name}.")
383 else:
384 raise ValueError(f"Unexpected dictionary type: {data['type']}.")
385 # key is (dataset type name, run)
386 self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
387 for data in datasetData:
388 datasetType = self.datasetTypes.get(data["dataset_type"])
389 if datasetType is None:
390 datasetType = self.registry.getDatasetType(data["dataset_type"])
391 self.datasets[data["dataset_type"], data["run"]].extend(
392 FileDataset(
393 d.get("path"),
394 [
395 DatasetRef(
396 datasetType,
397 dataId,
398 run=data["run"],
399 id=refid if not isinstance(refid, int) else _refIntId2UUID[refid],
400 )
401 for dataId, refid in zip(
402 ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"]), strict=True
403 )
404 ],
405 formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
406 )
407 for d in data["records"]
408 )
410 def register(self) -> None:
411 # Docstring inherited from RepoImportBackend.register.
412 for datasetType in self.datasetTypes:
413 self.registry.registerDatasetType(datasetType)
414 for run in self.runs:
415 self.registry.registerRun(run, doc=self.collectionDocs.get(run))
416 # No way to add extra run info to registry yet.
417 for collection, collection_type in self.collections.items():
418 self.registry.registerCollection(
419 collection, collection_type, doc=self.collectionDocs.get(collection)
420 )
421 for chain, children in self.chains.items():
422 self.registry.registerCollection(
423 chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
424 )
425 self.registry.setCollectionChain(chain, children)
427 def load(
428 self,
429 datastore: Datastore | None,
430 *,
431 directory: ResourcePathExpression | None = None,
432 transfer: str | None = None,
433 skip_dimensions: set | None = None,
434 ) -> None:
435 # Docstring inherited from RepoImportBackend.load.
436 for element, dimensionRecords in self.dimensions.items():
437 if skip_dimensions and element in skip_dimensions:
438 continue
439 # Using skip_existing=True here assumes that the records in the
440 # database are either equivalent or at least preferable to the ones
441 # being imported. It'd be ideal to check that, but that would mean
442 # using syncDimensionData, which is not vectorized and is hence
443 # unacceptably slow.
444 self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
445 # FileDatasets to ingest into the datastore (in bulk):
446 fileDatasets = []
447 for records in self.datasets.values():
448 # Make a big flattened list of all data IDs and dataset_ids, while
449 # remembering slices that associate them with the FileDataset
450 # instances they came from.
451 datasets: list[DatasetRef] = []
452 dataset_ids: list[DatasetId] = []
453 slices = []
454 for fileDataset in records:
455 start = len(datasets)
456 datasets.extend(fileDataset.refs)
457 dataset_ids.extend(ref.id for ref in fileDataset.refs)
458 stop = len(datasets)
459 slices.append(slice(start, stop))
460 # Insert all of those DatasetRefs at once.
461 # For now, we ignore the dataset_id we pulled from the file
462 # and just insert without one to get a new autoincrement value.
463 # Eventually (once we have origin in IDs) we'll preserve them.
464 resolvedRefs = self.registry._importDatasets(datasets)
465 # Populate our dictionary that maps int dataset_id values from the
466 # export file to the new DatasetRefs
467 for fileId, ref in zip(dataset_ids, resolvedRefs, strict=True):
468 self.refsByFileId[fileId] = ref
469 # Now iterate over the original records, and install the new
470 # resolved DatasetRefs to replace the unresolved ones as we
471 # reorganize the collection information.
472 for sliceForFileDataset, fileDataset in zip(slices, records, strict=True):
473 fileDataset.refs = resolvedRefs[sliceForFileDataset]
474 if directory is not None:
475 fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
476 fileDatasets.append(fileDataset)
477 # Ingest everything into the datastore at once.
478 if datastore is not None and fileDatasets:
479 datastore.ingest(*fileDatasets, transfer=transfer)
480 # Associate datasets with tagged collections.
481 for collection, dataset_ids in self.tagAssociations.items():
482 self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
483 # Associate datasets with calibration collections.
484 for collection, idsByTimespan in self.calibAssociations.items():
485 for timespan, dataset_ids in idsByTimespan.items():
486 self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
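Imports normally go through Butler.import_(), but the backend's two-phase contract is visible when it is used directly: the constructor reads and validates the whole YAML file up front, register() creates dataset types and collections (which cannot run inside a transaction), and load() then inserts dimension records and datasets and optionally ingests files. A hypothetical sketch (the paths are assumptions; pass None for the datastore to do a registry-only import):

    from lsst.daf.butler import Butler

    butler = Butler("/path/to/repo", writeable=True)  # hypothetical repository
    with open("export.yaml") as stream:
        backend = YamlRepoImportBackend(stream, butler.registry)  # file fully read here
    backend.register()  # dataset types, runs, collections, chains
    backend.load(None)  # pass a Datastore (and directory/transfer) to also ingest files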