Coverage for python/lsst/daf/butler/transfers/_yaml.py: 12% (299 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["YamlRepoExportBackend", "YamlRepoImportBackend"]

import logging
import uuid
import warnings
from collections import UserDict, defaultdict
from collections.abc import Iterable, Mapping
from datetime import datetime
from typing import IO, TYPE_CHECKING, Any

import astropy.time
import yaml
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import find_outside_stacklevel
from lsst.utils.iteration import ensure_iterable

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetRef
from .._dataset_type import DatasetType
from .._file_dataset import FileDataset
from .._named import NamedValueSet
from .._timespan import Timespan
from ..datastore import Datastore
from ..dimensions import DimensionElement, DimensionRecord, DimensionUniverse
from ..registry import CollectionType
from ..registry.interfaces import ChainedCollectionRecord, CollectionRecord, RunRecord, VersionTuple
from ..registry.sql_registry import SqlRegistry
from ..registry.versions import IncompatibleVersionError
from ._interfaces import RepoExportBackend, RepoImportBackend

if TYPE_CHECKING:
    from lsst.resources import ResourcePathExpression

_LOG = logging.getLogger(__name__)

EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 2)
"""Export format version.

Files with a different major version or a newer minor version cannot be read by
this version of the code.
"""


class _RefMapper(UserDict[int, uuid.UUID]):
    """Create a local dict subclass which creates new deterministic UUID for
    missing keys.
    """

    _namespace = uuid.UUID("4d4851f4-2890-4d41-8779-5f38a3f5062b")

    def __missing__(self, key: int) -> uuid.UUID:
        newUUID = uuid.uuid3(namespace=self._namespace, name=str(key))
        self[key] = newUUID
        return newUUID


_refIntId2UUID = _RefMapper()
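
# For illustration (not part of the original module): the mapping is
# deterministic because uuid3 hashes the fixed namespace together with the
# stringified key, so re-importing the same legacy integer dataset ID always
# yields the same UUID, e.g.
#
#     assert _refIntId2UUID[42] == uuid.uuid3(_RefMapper._namespace, "42")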


def _uuid_representer(dumper: yaml.Dumper, data: uuid.UUID) -> yaml.Node:
    """Generate YAML representation for UUID.

    This produces a scalar node with a tag "!uuid" and value being a regular
    string representation of UUID.
    """
    return dumper.represent_scalar("!uuid", str(data))


def _uuid_constructor(loader: yaml.Loader, node: yaml.Node) -> uuid.UUID | None:
    if node.value is not None:
        return uuid.UUID(hex=node.value)
    return None


yaml.Dumper.add_representer(uuid.UUID, _uuid_representer)
yaml.SafeLoader.add_constructor("!uuid", _uuid_constructor)
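
# Round-trip sketch (illustrative only): with the representer and constructor
# registered above, yaml.dump emits UUIDs as "!uuid" scalars and yaml.safe_load
# turns them back into uuid.UUID instances, e.g.
#
#     text = yaml.dump(uuid.uuid4())                 # "!uuid '...'\n"
#     assert isinstance(yaml.safe_load(text), uuid.UUID)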


class YamlRepoExportBackend(RepoExportBackend):
    """A repository export implementation that saves to a YAML file.

    Parameters
    ----------
    stream : `io.IO`
        A writeable file-like object.
    universe : `DimensionUniverse`
        The dimension universe to use for the export.
    """

    def __init__(self, stream: IO, universe: DimensionUniverse):
        self.stream = stream
        self.universe = universe
        self.data: list[dict[str, Any]] = []

    def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -> None:
        # Docstring inherited from RepoExportBackend.saveDimensionData.
        data_dicts = [record.toDict(splitTimespan=True) for record in data]
        self.data.append(
            {
                "type": "dimension",
                "element": element.name,
                "records": data_dicts,
            }
        )

    def saveCollection(self, record: CollectionRecord, doc: str | None) -> None:
        # Docstring inherited from RepoExportBackend.saveCollections.
        data: dict[str, Any] = {
            "type": "collection",
            "collection_type": record.type.name,
            "name": record.name,
        }
        if doc is not None:
            data["doc"] = doc
        if isinstance(record, RunRecord):
            data["host"] = record.host
            data["timespan_begin"] = record.timespan.begin
            data["timespan_end"] = record.timespan.end
        elif isinstance(record, ChainedCollectionRecord):
            data["children"] = list(record.children)
        self.data.append(data)

    def saveDatasets(self, datasetType: DatasetType, run: str, *datasets: FileDataset) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasets.
        self.data.append(
            {
                "type": "dataset_type",
                "name": datasetType.name,
                "dimensions": list(datasetType.dimensions.names),
                "storage_class": datasetType.storageClass_name,
                "is_calibration": datasetType.isCalibration(),
            }
        )
        self.data.append(
            {
                "type": "dataset",
                "dataset_type": datasetType.name,
                "run": run,
                "records": [
                    {
                        "dataset_id": [ref.id for ref in sorted(dataset.refs)],
                        "data_id": [dict(ref.dataId.required) for ref in sorted(dataset.refs)],
                        "path": dataset.path,
                        "formatter": dataset.formatter,
                        # TODO: look up and save other collections
                    }
                    for dataset in datasets
                ],
            }
        )

    def saveDatasetAssociations(
        self, collection: str, collectionType: CollectionType, associations: Iterable[DatasetAssociation]
    ) -> None:
        # Docstring inherited from RepoExportBackend.saveDatasetAssociations.
        if collectionType is CollectionType.TAGGED:
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "dataset_ids": [assoc.ref.id for assoc in associations],
                }
            )
        elif collectionType is CollectionType.CALIBRATION:
            idsByTimespan: dict[Timespan, list[DatasetId]] = defaultdict(list)
            for association in associations:
                assert association.timespan is not None
                idsByTimespan[association.timespan].append(association.ref.id)
            self.data.append(
                {
                    "type": "associations",
                    "collection": collection,
                    "collection_type": collectionType.name,
                    "validity_ranges": [
                        {
                            "timespan": timespan,
                            "dataset_ids": dataset_ids,
                        }
                        for timespan, dataset_ids in idsByTimespan.items()
                    ],
                }
            )

    def finish(self) -> None:
        # Docstring inherited from RepoExportBackend.
        yaml.dump(
            {
                "description": "Butler Data Repository Export",
                "version": str(EXPORT_FORMAT_VERSION),
                "universe_version": self.universe.version,
                "universe_namespace": self.universe.namespace,
                "data": self.data,
            },
            stream=self.stream,
            sort_keys=False,
        )
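
# Usage sketch (illustrative only, not part of the original module): the export
# backend accumulates entries in memory and writes nothing to the stream until
# finish() is called, roughly
#
#     backend = YamlRepoExportBackend(stream, universe)
#     backend.saveCollection(record, doc=None)
#     backend.finish()  # the YAML document is dumped to `stream` here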


class _DayObsOffsetCalculator:
    """Interface to allow the day_obs offset to be calculated from an
    instrument class name and cached.
    """

    name_to_class_name: dict[str, str]
    name_to_offset: dict[str, int | None]

    def __init__(self) -> None:
        self.name_to_class_name = {}
        self.name_to_offset = {}

    def __setitem__(self, name: str, class_name: str) -> None:
        """Store the instrument class name.

        Parameters
        ----------
        name : `str`
            Name of the instrument.
        class_name : `str`
            Full name of the instrument class.
        """
        self.name_to_class_name[name] = class_name

    def get_offset(self, name: str, date: astropy.time.Time) -> int | None:
        """Return the offset to use when calculating day_obs.

        Parameters
        ----------
        name : `str`
            The instrument name.
        date : `astropy.time.Time`
            Time for which the offset is required.

        Returns
        -------
        offset : `int` or `None`
            The offset in seconds, or `None` if no offset could be determined.
        """
        if name in self.name_to_offset:
            return self.name_to_offset[name]

        try:
            instrument_class = doImportType(self.name_to_class_name[name])
        except Exception:
            # Any error at all, store None and do not try again.
            self.name_to_offset[name] = None
            return None

        # Assume this is a `lsst.pipe.base.Instrument` and that it has
        # a translatorClass property pointing to an
        # astro_metadata_translator.MetadataTranslator class. If this is not
        # true give up and store None.
        try:
            offset_delta = instrument_class.translatorClass.observing_date_to_offset(date)  # type: ignore
        except Exception:
            offset_delta = None

        if offset_delta is None:
            self.name_to_offset[name] = None
            return None

        self.name_to_offset[name] = round(offset_delta.to_value("s"))
        return self.name_to_offset[name]
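
# Usage sketch (hypothetical instrument name and class path, for illustration
# only):
#
#     calculator = _DayObsOffsetCalculator()
#     calculator["HypotheticalCam"] = "some_obs_package.HypotheticalCam"
#     offset = calculator.get_offset("HypotheticalCam", astropy.time.Time.now())
#     # `offset` is the day_obs offset in seconds, or None if the class (or its
#     # metadata translator) could not provide one; results are cached per
#     # instrument name.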


class YamlRepoImportBackend(RepoImportBackend):
    """A repository import implementation that reads from a YAML file.

    Parameters
    ----------
    stream : `io.IO`
        A readable file-like object.
    registry : `SqlRegistry`
        The registry datasets will be imported into. Only used to retrieve
        dataset types during construction; all writes happen in `register`
        and `load`.
    """

    def __init__(self, stream: IO, registry: SqlRegistry):
        # We read the file fully and convert its contents to Python objects
        # instead of loading incrementally so we can spot some problems early;
        # because `register` can't be put inside a transaction, we'd rather not
        # run that at all if there's going to be a problem later in `load`.
        wrapper = yaml.safe_load(stream)
        if wrapper["version"] == 0:
            # Grandfather-in 'version: 0' -> 1.0.0, which is what we wrote
            # before we really tried to do versioning here.
            fileVersion = VersionTuple(1, 0, 0)
        else:
            fileVersion = VersionTuple.fromString(wrapper["version"])
            if fileVersion.major != EXPORT_FORMAT_VERSION.major:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"({EXPORT_FORMAT_VERSION.major}.x.x required)."
                )
            if fileVersion.minor > EXPORT_FORMAT_VERSION.minor:
                raise IncompatibleVersionError(
                    f"Cannot read repository export file with version={fileVersion} "
                    f"< {EXPORT_FORMAT_VERSION.major}.{EXPORT_FORMAT_VERSION.minor}.x required."
                )
        self.runs: dict[str, tuple[str | None, Timespan]] = {}
        self.chains: dict[str, list[str]] = {}
        self.collections: dict[str, CollectionType] = {}
        self.collectionDocs: dict[str, str] = {}
        self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
        self.dimensions: Mapping[DimensionElement, list[DimensionRecord]] = defaultdict(list)
        self.tagAssociations: dict[str, list[DatasetId]] = defaultdict(list)
        self.calibAssociations: dict[str, dict[Timespan, list[DatasetId]]] = defaultdict(dict)
        self.refsByFileId: dict[DatasetId, DatasetRef] = {}
        self.registry: SqlRegistry = registry

        universe_version = wrapper.get("universe_version", 0)
        universe_namespace = wrapper.get("universe_namespace", "daf_butler")

        # If this is data exported before the reorganization of visits
        # and visit systems and that new schema is in use, some filtering
        # will be needed. The entry in the visit dimension record will be
        # silently dropped when visit is created but the
        # visit_system_membership must be constructed.
        migrate_visit_system = False
        if (
            universe_version < 2
            and universe_namespace == "daf_butler"
            and "visit_system_membership" in self.registry.dimensions
        ):
            migrate_visit_system = True

        # Drop "seeing" from visits in files older than version 1.
        migrate_visit_seeing = False
        if (
            universe_version < 1
            and universe_namespace == "daf_butler"
            and "visit" in self.registry.dimensions
            and "seeing" not in self.registry.dimensions["visit"].metadata
        ):
            migrate_visit_seeing = True

        # If this data was exported before group was a first-class dimension,
        # we'll need to modify some exposure columns and add group records.
        migrate_group = False
        if (
            universe_version < 6
            and universe_namespace == "daf_butler"
            and "exposure" in self.registry.dimensions
            and "group" in self.registry.dimensions["exposure"].implied
        ):
            migrate_group = True

        # If this data was exported before day_obs was a first-class dimension,
        # we'll need to modify some exposure and visit columns and add day_obs
        # records. This is especially tricky because some files even predate
        # the existence of data ID values.
        migrate_exposure_day_obs = False
        migrate_visit_day_obs = False
        day_obs_ids: set[tuple[str, int]] = set()
        if universe_version < 6 and universe_namespace == "daf_butler":
            if (
                "exposure" in self.registry.dimensions
                and "day_obs" in self.registry.dimensions["exposure"].implied
            ):
                migrate_exposure_day_obs = True
            if "visit" in self.registry.dimensions and "day_obs" in self.registry.dimensions["visit"].implied:
                migrate_visit_day_obs = True

        # If this is pre-v1 universe we may need to fill in a missing
        # visit.day_obs field.
        migrate_add_visit_day_obs = False
        if (
            universe_version < 1
            and universe_namespace == "daf_butler"
            and (
                "day_obs" in self.registry.dimensions["visit"].implied
                or "day_obs" in self.registry.dimensions["visit"].metadata
            )
        ):
            migrate_add_visit_day_obs = True

        # Some conversions may need to work out a day_obs timespan.
        # The only way this offset can be found is by querying the instrument
        # class. Read all the existing instrument classes indexed by name.
        instrument_classes: dict[str, str] = {}
        if migrate_exposure_day_obs or migrate_visit_day_obs or migrate_add_visit_day_obs:
            day_obs_offset_calculator = _DayObsOffsetCalculator()
            for rec in self.registry.queryDimensionRecords("instrument"):
                day_obs_offset_calculator[rec.name] = rec.class_name

        datasetData = []
        RecordClass: type[DimensionRecord]
        for data in wrapper["data"]:
            if data["type"] == "dimension":
                # convert all datetime values to astropy
                for record in data["records"]:
                    for key in record:
                        # Some older YAML files were produced with native
                        # YAML support for datetime; we support reading that
                        # data back. Newer conversion uses _AstropyTimeToYAML
                        # class with special YAML tag.
                        if isinstance(record[key], datetime):
                            record[key] = astropy.time.Time(record[key], scale="utc")

                if data["element"] == "instrument":
                    if migrate_exposure_day_obs or migrate_visit_day_obs:
                        # Might want the instrument class name for later.
                        for record in data["records"]:
                            if record["name"] not in instrument_classes:
                                instrument_classes[record["name"]] = record["class_name"]

                if data["element"] == "visit":
                    if migrate_visit_system:
                        # Must create the visit_system_membership records.
                        # But first create empty list for visits since other
                        # logic in this file depends on self.dimensions being
                        # populated in an order consistent with primary keys.
                        self.dimensions[self.registry.dimensions["visit"]] = []
                        element = self.registry.dimensions["visit_system_membership"]
                        RecordClass = element.RecordClass
                        self.dimensions[element].extend(
                            RecordClass(
                                instrument=r["instrument"], visit_system=r.pop("visit_system"), visit=r["id"]
                            )
                            for r in data["records"]
                        )
                    if migrate_visit_seeing:
                        for record in data["records"]:
                            record.pop("seeing", None)
                    if migrate_add_visit_day_obs:
                        # The day_obs field is missing. It can be derived from
                        # the datetime_begin field.
                        for record in data["records"]:
                            date = record["datetime_begin"].tai
                            offset = day_obs_offset_calculator.get_offset(record["instrument"], date)
                            # This field is required so we have to calculate
                            # it even if the offset is not defined.
                            if offset:
                                date = date - astropy.time.TimeDelta(offset, format="sec", scale="tai")
                            record["day_obs"] = int(date.strftime("%Y%m%d"))
                    if migrate_visit_day_obs:
                        # Poke the entry for this dimension to make sure it
                        # appears in the right order, even though we'll
                        # populate it later.
                        self.dimensions[self.registry.dimensions["day_obs"]]
                        for record in data["records"]:
                            day_obs_ids.add((record["instrument"], record["day_obs"]))

                if data["element"] == "exposure":
                    if migrate_group:
                        element = self.registry.dimensions["group"]
                        RecordClass = element.RecordClass
                        group_records = self.dimensions[element]
                        for exposure_record in data["records"]:
                            exposure_record["group"] = exposure_record.pop("group_name")
                            del exposure_record["group_id"]
                            group_records.append(
                                RecordClass(
                                    instrument=exposure_record["instrument"], name=exposure_record["group"]
                                )
                            )
                    if migrate_exposure_day_obs:
                        # Poke the entry for this dimension to make sure it
                        # appears in the right order, even though we'll
                        # populate it later.
                        for record in data["records"]:
                            day_obs_ids.add((record["instrument"], record["day_obs"]))

                element = self.registry.dimensions[data["element"]]
                RecordClass = element.RecordClass
                self.dimensions[element].extend(RecordClass(**r) for r in data["records"])

            elif data["type"] == "collection":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.RUN:
                    self.runs[data["name"]] = (
                        data["host"],
                        Timespan(begin=data["timespan_begin"], end=data["timespan_end"]),
                    )
                elif collectionType is CollectionType.CHAINED:
                    children = []
                    for child in data["children"]:
                        if not isinstance(child, str):
                            warnings.warn(
                                f"CHAINED collection {data['name']} includes restrictions on child "
                                "collection searches, which are no longer supported and will be ignored.",
                                stacklevel=find_outside_stacklevel("lsst.daf.butler"),
                            )
                            # Old form with dataset type restrictions only,
                            # supported for backwards compatibility.
                            child, _ = child
                        children.append(child)
                    self.chains[data["name"]] = children
                else:
                    self.collections[data["name"]] = collectionType
                doc = data.get("doc")
                if doc is not None:
                    self.collectionDocs[data["name"]] = doc
            elif data["type"] == "run":
                # Also support old form of saving a run with no extra info.
                self.runs[data["name"]] = (None, Timespan(None, None))
            elif data["type"] == "dataset_type":
                dimensions = data["dimensions"]
                if migrate_visit_system and "visit" in dimensions and "visit_system" in dimensions:
                    dimensions.remove("visit_system")
                self.datasetTypes.add(
                    DatasetType(
                        data["name"],
                        dimensions=dimensions,
                        storageClass=data["storage_class"],
                        universe=self.registry.dimensions,
                        isCalibration=data.get("is_calibration", False),
                    )
                )
            elif data["type"] == "dataset":
                # Save raw dataset data for a second loop, so we can ensure we
                # know about all dataset types first.
                datasetData.append(data)
            elif data["type"] == "associations":
                collectionType = CollectionType.from_name(data["collection_type"])
                if collectionType is CollectionType.TAGGED:
                    self.tagAssociations[data["collection"]].extend(
                        [x if not isinstance(x, int) else _refIntId2UUID[x] for x in data["dataset_ids"]]
                    )
                elif collectionType is CollectionType.CALIBRATION:
                    assocsByTimespan = self.calibAssociations[data["collection"]]
                    for d in data["validity_ranges"]:
                        if "timespan" in d:
                            assocsByTimespan[d["timespan"]] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                        else:
                            # TODO: this is for backward compatibility, should
                            # be removed at some point.
                            assocsByTimespan[Timespan(begin=d["begin"], end=d["end"])] = [
                                x if not isinstance(x, int) else _refIntId2UUID[x] for x in d["dataset_ids"]
                            ]
                else:
                    raise ValueError(f"Unexpected collection type for association: {collectionType.name}.")
            else:
                raise ValueError(f"Unexpected dictionary type: {data['type']}.")

        if day_obs_ids:
            element = self.registry.dimensions["day_obs"]
            RecordClass = element.RecordClass
            missing_offsets = set()
            for instrument, day_obs in day_obs_ids:
                # To get the offset we need the astropy time. Since we are
                # going from a day_obs to a time, it's possible that in some
                # scenario the offset will be wrong.
                ymd = str(day_obs)
                t = astropy.time.Time(
                    f"{ymd[0:4]}-{ymd[4:6]}-{ymd[6:8]}T00:00:00", format="isot", scale="tai"
                )
                offset = day_obs_offset_calculator.get_offset(instrument, t)

                # This should always return an offset but as a fallback
                # allow None here in case something has gone wrong above.
                # In particular, not being able to load an instrument class.
                if offset is not None:
                    timespan = Timespan.from_day_obs(day_obs, offset=offset)
                else:
                    timespan = None
                    missing_offsets.add(instrument)
                self.dimensions[element].append(
                    RecordClass(instrument=instrument, id=day_obs, timespan=timespan)
                )

            if missing_offsets:
                plural = "" if len(missing_offsets) == 1 else "s"
                warnings.warn(
                    "Constructing day_obs records with no timespans for "
                    "visit/exposure records that were exported before day_obs was a dimension. "
                    f"(instrument{plural}: {missing_offsets})"
                )

        # key is (dataset type name, run)
        self.datasets: Mapping[tuple[str, str], list[FileDataset]] = defaultdict(list)
        for data in datasetData:
            datasetType = self.datasetTypes.get(data["dataset_type"])
            if datasetType is None:
                datasetType = self.registry.getDatasetType(data["dataset_type"])
            self.datasets[data["dataset_type"], data["run"]].extend(
                FileDataset(
                    d.get("path"),
                    [
                        DatasetRef(
                            datasetType,
                            dataId,
                            run=data["run"],
                            id=refid if not isinstance(refid, int) else _refIntId2UUID[refid],
                        )
                        for dataId, refid in zip(
                            ensure_iterable(d["data_id"]), ensure_iterable(d["dataset_id"]), strict=True
                        )
                    ],
                    formatter=doImportType(d.get("formatter")) if "formatter" in d else None,
                )
                for d in data["records"]
            )
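
        # At this point nothing has been written to the registry or datastore:
        # the constructor has only parsed the export file into in-memory
        # structures (runs, chains, collections, dataset types, dimension
        # records, datasets, and associations) for register() and load() to
        # apply.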

    def register(self) -> None:
        # Docstring inherited from RepoImportBackend.register.
        for datasetType in self.datasetTypes:
            self.registry.registerDatasetType(datasetType)
        for run in self.runs:
            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
            # No way to add extra run info to registry yet.
        for collection, collection_type in self.collections.items():
            self.registry.registerCollection(
                collection, collection_type, doc=self.collectionDocs.get(collection)
            )
        for chain, children in self.chains.items():
            self.registry.registerCollection(
                chain, CollectionType.CHAINED, doc=self.collectionDocs.get(chain)
            )
            self.registry.setCollectionChain(chain, children)

    def load(
        self,
        datastore: Datastore | None,
        *,
        directory: ResourcePathExpression | None = None,
        transfer: str | None = None,
        skip_dimensions: set | None = None,
    ) -> None:
        # Docstring inherited from RepoImportBackend.load.
        # Must ensure we insert in order supported by the universe.
        for element in self.registry.dimensions.sorted(self.dimensions.keys()):
            dimensionRecords = self.dimensions[element]
            if skip_dimensions and element in skip_dimensions:
                continue
            # Using skip_existing=True here assumes that the records in the
            # database are either equivalent or at least preferable to the ones
            # being imported. It'd be ideal to check that, but that would mean
            # using syncDimensionData, which is not vectorized and is hence
            # unacceptably slow.
            self.registry.insertDimensionData(element, *dimensionRecords, skip_existing=True)
        # FileDatasets to ingest into the datastore (in bulk):
        fileDatasets = []
        for records in self.datasets.values():
            # Make a big flattened list of all data IDs and dataset_ids, while
            # remembering slices that associate them with the FileDataset
            # instances they came from.
            datasets: list[DatasetRef] = []
            dataset_ids: list[DatasetId] = []
            slices = []
            for fileDataset in records:
                start = len(datasets)
                datasets.extend(fileDataset.refs)
                dataset_ids.extend(ref.id for ref in fileDataset.refs)
                stop = len(datasets)
                slices.append(slice(start, stop))
            # Insert all of those DatasetRefs at once.
            # For now, we ignore the dataset_id we pulled from the file
            # and just insert without one to get a new autoincrement value.
            # Eventually (once we have origin in IDs) we'll preserve them.
            resolvedRefs = self.registry._importDatasets(datasets)
            # Populate our dictionary that maps int dataset_id values from the
            # export file to the new DatasetRefs
            for fileId, ref in zip(dataset_ids, resolvedRefs, strict=True):
                self.refsByFileId[fileId] = ref
            # Now iterate over the original records, and install the new
            # resolved DatasetRefs to replace the unresolved ones as we
            # reorganize the collection information.
            for sliceForFileDataset, fileDataset in zip(slices, records, strict=True):
                fileDataset.refs = resolvedRefs[sliceForFileDataset]
                if directory is not None:
                    fileDataset.path = ResourcePath(directory, forceDirectory=True).join(fileDataset.path)
                fileDatasets.append(fileDataset)
        # Ingest everything into the datastore at once.
        if datastore is not None and fileDatasets:
            datastore.ingest(*fileDatasets, transfer=transfer)
        # Associate datasets with tagged collections.
        for collection, dataset_ids in self.tagAssociations.items():
            self.registry.associate(collection, [self.refsByFileId[i] for i in dataset_ids])
        # Associate datasets with calibration collections.
        for collection, idsByTimespan in self.calibAssociations.items():
            for timespan, dataset_ids in idsByTimespan.items():
                self.registry.certify(collection, [self.refsByFileId[i] for i in dataset_ids], timespan)
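
# Typical driver sequence (a sketch, assuming an already-open export stream and
# a SqlRegistry; in practice the public Butler export/import machinery calls
# these backends):
#
#     backend = YamlRepoImportBackend(stream, registry)
#     backend.register()   # dataset types, runs, and other collections
#     backend.load(datastore, directory=None, transfer="auto")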