Coverage for python/lsst/daf/butler/transfers/_yaml.py: 14%
208 statements
coverage.py v7.4.1, created at 2024-02-01 11:20 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import uuid
import warnings
from collections import UserDict, defaultdict
from collections.abc import Iterable, Mapping
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel
from lsst.utils.iteration import ensure_iterable

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetRef
from .._dataset_type import DatasetType
from .._file_dataset import FileDataset
from .._named import NamedValueSet
from .._timespan import Timespan
from ..datastore import Datastore
from ..dimensions import DimensionElement, DimensionRecord, DimensionUniverse
from ..registry import CollectionType
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
from ..registry.sql_registry import SqlRegistry
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""
class _RefMapper(UserDict[int, uuid.UUID]):
    """A local `dict` subclass that creates a new deterministic UUID for each
    missing key.
    """

    _namespace = uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b")

    def __missing__(self, key: int) -> uuid.UUID:
        newUUID = uuid.uuid3(namespace=self._namespace, name=str(key))
        self[key] = newUUID
        return newUUID


_refIntId2UUID = _RefMapper()
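
# Because the mapping uses `uuid.uuid3` with a fixed namespace, a given legacy
# integer dataset ID always maps to the same UUID, both within and across
# processes.  A minimal sketch:
#
#     assert _refIntId2UUID[42] == _refIntId2UUID[42]
#     assert _refIntId2UUID[42] == uuid.uuid3(_RefMapper._namespace, "42")
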
def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate a YAML representation for a UUID.

    This produces a scalar node with the tag "!uuid" whose value is the
    regular string representation of the UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
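
# With the representer and constructor registered above, UUIDs round-trip
# through YAML as "!uuid"-tagged scalars.  An illustrative sketch:
#
#     ref_id = uuid.uuid4()
#     text = yaml.dump(ref_id)        # a "!uuid"-tagged scalar
#     assert yaml.safe_load(text) == ref_id
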
class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream : `io.IO`
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe to use for the export.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: list[dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )
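
    # In the YAML written by `finish`, each call above becomes an entry of the
    # form below (the element name and record values are purely illustrative;
    # record keys follow the element's own schema):
    #
    #     - type: dimension
    #       element: instrument
    #       records:
    #         - {name: HypotheticalCam, detector_max: 4}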

    def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": list(datasetType.dimensions.names),
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [dict(ref.dataId.required) for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )
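
    # A "dataset" entry appended above appears in the output YAML roughly as
    # follows (the dataset type, run, data ID, path, and formatter shown here
    # are hypothetical placeholders):
    #
    #     - type: dataset
    #       dataset_type: some_dataset_type
    #       run: some/run
    #       records:
    #         - dataset_id: [!uuid 5f1b2c3d-...]
    #           data_id: [{instrument: HypotheticalCam, detector: 3}]
    #           path: relative/path/to/file.fits
    #           formatter: some.package.SomeFormatter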

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
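
# A minimal export sketch, assuming the dimension universe, dimension records,
# dataset type, and `FileDataset` instances are obtained elsewhere (e.g. from a
# registry); only the stream and universe are required by the backend itself:
#
#     with open("export.yaml", "w") as stream:
#         backend = YamlRepoExportBackend(stream, universe)
#         backend.saveDimensionData(universe["instrument"], *instrument_records)
#         backend.saveDatasets(dataset_type, "some/run", *file_datasets)
#         backend.finish()
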
class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream : `io.IO`
        A readable file-like object.
    registry : `SqlRegistry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: SqlRegistry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
        if fileVersion.major != EXPORT_FORMAT_VERSION.major:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
            )
        if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
            raise IncompatibleVersionError(
                f"Cannot read repository export file with version={fileVersion} "
                f"({EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x or older required)."
            )
        self.runs: dict[str, tuple[str | None, Timespan]] = {}
        self.chains: dict[str, list[str]] = {}
        self.collections: dict[str, CollectionType] = {}
        self.collectionDocs: dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
        self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: dict[DatasetId, DatasetRef] = {}
        self.registry: SqlRegistry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this is data exported before the reorganization of visits and
        # visit systems, and that new schema is in use, some filtering will be
        # needed: the entry in the visit dimension record will be silently
        # dropped when the visit is created, but the visit_system_membership
        # records must be constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        # Drop "seeing" from visits in files older than version 1.
        migrate_visit_seeing = False
        if (
            universe_version < 1
            and universe_namespace == "daf_butler"
            and "visit" in self.registry.dimensions
            and "seeing" not in self.registry.dimensions["visit"].metadata
        ):
            migrate_visit_seeing = True

        datasetData = []
        RecordClass: type[DimensionRecord]
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # Convert all datetime values to astropy.
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native YAML
                        # support for datetime; we support reading that data
                        # back. Newer conversion uses the _AstropyTimeToYAML
                        # class with a special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")

                if data["element"] == "visit":
                    if migrate_visit_system:
                        # Must create the visit_system_membership records, but
                        # first create an empty list for visits, since other
                        # logic in this file depends on self.dimensions being
                        # populated in an order consistent with primary keys.
                        self.dimensions[self.registry.dimensions["visit"]] = []
                        element = self.registry.dimensions["visit_system_membership"]
                        RecordClass = element.RecordClass
                        self.dimensions[element].extend(
                            RecordClass(
                                instrument=r["instrument"], visit_system=r.pop("visit_system"), visit=r["id"]
                            )
                            for r in data["records"]
                        )
                    if migrate_visit_seeing:
                        for record in data["records"]:
                            record.pop("seeing", None)

                element = self.registry.dimensions[data["element"]]
                RecordClass = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])
            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored.",
                                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(
                        [x if not isinstance(x, int) else _refIntId2UUID[x] for x in data["dataset_ids"]]
                    )
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")
        # key is (dataset type name, run)
        self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(
                            datasetType,
                            dataId,
                            run=data["run"],
                            id=refid if not isinstance(refid, int) else _refIntId2UUID[refid],
                        )
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"]), strict=True
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Datastore | None,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
        skip_dimensions: set | None = None,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        for element, dimensionRecords in self.dimensions.items():
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for records in self.datasets.values():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: list[DatasetRef] = []
            dataset_ids: list[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs
            for fileId, ref in zip(dataset_ids, resolvedRefs, strict=True):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records, strict=True):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
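
# A minimal import sketch, assuming a `SqlRegistry` and (optionally) a
# `Datastore` are obtained elsewhere (e.g. from a Butler); this module only
# defines the backend:
#
#     with open("export.yaml") as stream:
#         backend = YamlRepoImportBackend(stream, registry)
#     backend.register()   # dataset types, runs, collections, chains
#     backend.load(datastore, directory="/path/to/exported/files", transfer="copy")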