Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

1040 statements  

coverage.py v7.3.2, created at 2023-12-05 11:07 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Generic file-based datastore code.""" 

29 

30from __future__ import annotations 

31 

32__all__ = ("FileDatastore",) 

33 

34import contextlib 

35import hashlib 

36import logging 

37from collections import defaultdict 

38from collections.abc import Callable, Iterable, Mapping, Sequence 

39from dataclasses import dataclass 

40from typing import TYPE_CHECKING, Any, ClassVar, cast 

41 

42from lsst.daf.butler import ( 

43 Config, 

44 DatasetId, 

45 DatasetRef, 

46 DatasetType, 

47 DatasetTypeNotSupportedError, 

48 FileDataset, 

49 FileDescriptor, 

50 Formatter, 

51 FormatterFactory, 

52 Location, 

53 LocationFactory, 

54 Progress, 

55 StorageClass, 

56 ddl, 

57) 

58from lsst.daf.butler.datastore import ( 

59 DatasetRefURIs, 

60 Datastore, 

61 DatastoreConfig, 

62 DatastoreOpaqueTable, 

63 DatastoreValidationError, 

64) 

65from lsst.daf.butler.datastore.cache_manager import ( 

66 AbstractDatastoreCacheManager, 

67 DatastoreCacheManager, 

68 DatastoreDisabledCacheManager, 

69) 

70from lsst.daf.butler.datastore.composites import CompositesMap 

71from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError 

72from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore 

73from lsst.daf.butler.datastore.record_data import DatastoreRecordData 

74from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo 

75from lsst.daf.butler.registry.interfaces import ( 

76 DatabaseInsertMode, 

77 DatastoreRegistryBridge, 

78 FakeDatasetRef, 

79 ReadOnlyDatabaseError, 

80) 

81from lsst.daf.butler.repo_relocation import replaceRoot 

82from lsst.daf.butler.utils import transactional 

83from lsst.resources import ResourcePath, ResourcePathExpression 

84from lsst.utils.introspection import get_class_of, get_instance_of 

85from lsst.utils.iteration import chunk_iterable 

86 

87# For VERBOSE logging usage. 

88from lsst.utils.logging import VERBOSE, getLogger 

89from lsst.utils.timer import time_this 

90from sqlalchemy import BigInteger, String 

91 

92if TYPE_CHECKING: 

93 from lsst.daf.butler import LookupKey 

94 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

95 

96log = getLogger(__name__) 

97 

98 

99class _IngestPrepData(Datastore.IngestPrepData): 

100 """Helper class for FileDatastore ingest implementation. 

101 

102 Parameters 

103 ---------- 

104 datasets : `~collections.abc.Iterable` of `FileDataset` 

105 Files to be ingested by this datastore. 

106 """ 

107 

108 def __init__(self, datasets: Iterable[FileDataset]): 

109 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

110 self.datasets = datasets 

111 

112 

113@dataclass(frozen=True) 

114class DatastoreFileGetInformation: 

115 """Collection of useful parameters needed to retrieve a file from 

116 a Datastore. 

117 """ 

118 

119 location: Location 

120 """The location from which to read the dataset.""" 

121 

122 formatter: Formatter 

123 """The `Formatter` to use to deserialize the dataset.""" 

124 

125 info: StoredFileInfo 

126 """Stored information about this file and its formatter.""" 

127 

128 assemblerParams: Mapping[str, Any] 

129 """Parameters to use for post-processing the retrieved dataset.""" 

130 

131 formatterParams: Mapping[str, Any] 

132 """Parameters that were understood by the associated formatter.""" 

133 

134 component: str | None 

135 """The component to be retrieved (can be `None`).""" 

136 

137 readStorageClass: StorageClass 

138 """The `StorageClass` of the dataset being read.""" 

139 

140 

141class FileDatastore(GenericBaseDatastore[StoredFileInfo]): 

142 """Generic Datastore for file-based implementations. 

143 

144 Should always be sub-classed since key abstract methods are missing. 

145 

146 Parameters 

147 ---------- 

148 config : `DatastoreConfig` or `str` 

149 Configuration as either a `Config` object or URI to file. 

150 bridgeManager : `DatastoreRegistryBridgeManager` 

151 Object that manages the interface between `Registry` and datastores. 

152 butlerRoot : `str`, optional 

153 New datastore root to use to override the configuration value. 

154 

155 Raises 

156 ------ 

157 ValueError 

158 If root location does not exist and ``create`` is `False` in the 

159 configuration. 

160 """ 

161 

162 defaultConfigFile: ClassVar[str | None] = None 

163 """Path to configuration defaults. Accessed within the ``config`` resource 

164 or relative to a search path. Can be None if no defaults specified. 

165 """ 

166 

167 root: ResourcePath 

168 """Root directory URI of this `Datastore`.""" 

169 

170 locationFactory: LocationFactory 

171 """Factory for creating locations relative to the datastore root.""" 

172 

173 formatterFactory: FormatterFactory 

174 """Factory for creating instances of formatters.""" 

175 

176 templates: FileTemplates 

177 """File templates that can be used by this `Datastore`.""" 

178 

179 composites: CompositesMap 

180 """Determines whether a dataset should be disassembled on put.""" 

181 

182 defaultConfigFile = "datastores/fileDatastore.yaml" 

183 """Path to configuration defaults. Accessed within the ``config`` resource 

184 or relative to a search path. Can be None if no defaults specified. 

185 """ 

186 

187 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

188 """Callable that is used in trusted mode to retrieve registry definition 

189 of a named dataset type. 

190 """ 

191 

192 @classmethod 

193 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

194 """Set any filesystem-dependent config options for this Datastore to 

195 be appropriate for a new empty repository with the given root. 

196 

197 Parameters 

198 ---------- 

199 root : `str` 

200 URI to the root of the data repository. 

201 config : `Config` 

202 A `Config` to update. Only the subset understood by 

203 this component will be updated. Will not expand 

204 defaults. 

205 full : `Config` 

206 A complete config with all defaults expanded that can be 

207 converted to a `DatastoreConfig`. Read-only and will not be 

208 modified by this method. 

209 Repository-specific options that should not be obtained 

210 from defaults when Butler instances are constructed 

211 should be copied from ``full`` to ``config``. 

212 overwrite : `bool`, optional 

213 If `False`, do not modify a value in ``config`` if the value 

214 already exists. Default is always to overwrite with the provided 

215 ``root``. 

216 

217 Notes 

218 ----- 

219 If a keyword is explicitly defined in the supplied ``config`` it 

220 will not be overridden by this method if ``overwrite`` is `False`. 

221 This allows explicit values set in external configs to be retained. 

222 """ 

223 Config.updateParameters( 

224 DatastoreConfig, 

225 config, 

226 full, 

227 toUpdate={"root": root}, 

228 toCopy=("cls", ("records", "table")), 

229 overwrite=overwrite, 

230 ) 

231 

232 @classmethod 

233 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

234 return ddl.TableSpec( 

235 fields=[ 

236 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

237 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

238 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

239 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

240 # Use empty string to indicate no component 

241 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

242 # TODO: should checksum be Base64Bytes instead? 

243 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

244 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

245 ], 

246 unique=frozenset(), 

247 indexes=[ddl.IndexSpec("path")], 

248 ) 

249 

250 def __init__( 

251 self, 

252 config: DatastoreConfig | ResourcePathExpression, 

253 bridgeManager: DatastoreRegistryBridgeManager, 

254 butlerRoot: str | None = None, 

255 ): 

256 super().__init__(config, bridgeManager) 

257 if "root" not in self.config: 

258 raise ValueError("No root directory specified in configuration") 

259 

260 # Name ourselves either using an explicit name or a name 

261 # derived from the (unexpanded) root 

262 if "name" in self.config: 

263 self.name = self.config["name"] 

264 else: 

265 # We use the unexpanded root in the name to indicate that this 

266 # datastore can be moved without having to update registry. 

267 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

268 

269 # Support repository relocation in config 

270 # Existence of self.root is checked in subclass 

271 self.root = ResourcePath( 

272 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

273 ) 

274 

275 self.locationFactory = LocationFactory(self.root) 

276 self.formatterFactory = FormatterFactory() 

277 

278 # Now associate formatters with storage classes 

279 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

280 

281 # Read the file naming templates 

282 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

283 

284 # See if composites should be disassembled 

285 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

286 

287 self._opaque_table_name = self.config["records", "table"] 

288 try: 

289 # Storage of paths and formatters, keyed by dataset_id 

290 self._table = bridgeManager.opaque.register( 

291 self._opaque_table_name, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

292 ) 

293 # Interface to Registry. 

294 self._bridge = bridgeManager.register(self.name) 

295 except ReadOnlyDatabaseError: 

296 # If the database is read only and we just tried and failed to 

297 # create a table, it means someone is trying to create a read-only 

298 # butler client for an empty repo. That should be okay, as long 

299 # as they then try to get any datasets before some other client 

300 # creates the table. Chances are they're just validating 

301 # configuration. 

302 pass 

303 

304 # Determine whether checksums should be used - default to False 

305 self.useChecksum = self.config.get("checksum", False) 

306 

307 # Determine whether we can fall back to configuration if a 

308 # requested dataset is not known to registry 

309 self.trustGetRequest = self.config.get("trust_get_request", False) 

310 

311 # Create a cache manager 

312 self.cacheManager: AbstractDatastoreCacheManager 

313 if "cached" in self.config: 

314 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

315 else: 

316 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

317 

318 # Check existence and create directory structure if necessary 

319 if not self.root.exists(): 

320 if "create" not in self.config or not self.config["create"]: 

321 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

322 try: 

323 self.root.mkdir() 

324 except Exception as e: 

325 raise ValueError( 

326 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

327 ) from e 

328 

329 def __str__(self) -> str: 

330 return str(self.root) 

331 

332 @property 

333 def bridge(self) -> DatastoreRegistryBridge: 

334 return self._bridge 

335 

336 @property 

337 def roots(self) -> dict[str, ResourcePath | None]: 

338 # Docstring inherited. 

339 return {self.name: self.root} 

340 

341 def _artifact_exists(self, location: Location) -> bool: 

342 """Check that an artifact exists in this datastore at the specified 

343 location. 

344 

345 Parameters 

346 ---------- 

347 location : `Location` 

348 Expected location of the artifact associated with this datastore. 

349 

350 Returns 

351 ------- 

352 exists : `bool` 

353 `True` if the location can be found, `False` otherwise.

354 """ 

355 log.debug("Checking if resource exists: %s", location.uri) 

356 return location.uri.exists() 

357 

358 def _delete_artifact(self, location: Location) -> None: 

359 """Delete the artifact from the datastore. 

360 

361 Parameters 

362 ---------- 

363 location : `Location` 

364 Location of the artifact associated with this datastore. 

365 """ 

366 if location.pathInStore.isabs(): 

367 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

368 

369 try: 

370 location.uri.remove() 

371 except FileNotFoundError: 

372 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

373 raise 

374 except Exception as e: 

375 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

376 raise 

377 log.debug("Successfully deleted file: %s", location.uri) 

378 

379 def addStoredItemInfo( 

380 self, 

381 refs: Iterable[DatasetRef], 

382 infos: Iterable[StoredFileInfo], 

383 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

384 ) -> None: 

385 """Record internal storage information associated with one or more 

386 datasets. 

387 

388 Parameters 

389 ---------- 

390 refs : sequence of `DatasetRef` 

391 The datasets that have been stored. 

392 infos : sequence of `StoredDatastoreItemInfo` 

393 Metadata associated with the stored datasets. 

394 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode` 

395 Mode to use to insert the new records into the table. The 

396 options are ``INSERT`` (error if pre-existing), ``REPLACE`` 

397 (replace content with new values), and ``ENSURE`` (skip if the row 

398 already exists). 

399 """ 

400 records = [ 

401 info.rebase(ref).to_record(dataset_id=ref.id) for ref, info in zip(refs, infos, strict=True) 

402 ] 

403 match insert_mode: 

404 case DatabaseInsertMode.INSERT: 

405 self._table.insert(*records, transaction=self._transaction) 

406 case DatabaseInsertMode.ENSURE: 

407 self._table.ensure(*records, transaction=self._transaction) 

408 case DatabaseInsertMode.REPLACE: 

409 self._table.replace(*records, transaction=self._transaction) 

410 case _: 

411 raise ValueError(f"Unknown insert mode of '{insert_mode}'") 

412 

413 def getStoredItemsInfo( 

414 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

415 ) -> list[StoredFileInfo]: 

416 """Retrieve information associated with files stored in this 

417 `Datastore` associated with this dataset ref. 

418 

419 Parameters 

420 ---------- 

421 ref : `DatasetRef` 

422 The dataset that is to be queried. 

423 ignore_datastore_records : `bool` 

424 If `True` then do not use datastore records stored in refs. 

425 

426 Returns 

427 ------- 

428 items : `~collections.abc.Iterable` [`StoredDatastoreItemInfo`] 

429 Stored information about the files and their associated formatters

430 for this dataset. Only one file will be returned

431 if the dataset has not been disassembled. Can return an empty 

432 list if no matching datasets can be found. 

433 """ 

434 # Try to get them from the ref first. 

435 if ref._datastore_records is not None and not ignore_datastore_records: 

436 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

437 # Need to make sure they have correct type. 

438 for record in ref_records: 

439 if not isinstance(record, StoredFileInfo): 

440 raise TypeError(f"Datastore record has unexpected type {record.__class__.__name__}") 

441 return cast(list[StoredFileInfo], ref_records) 

442 

443 # Look for the dataset_id -- there might be multiple matches 

444 # if we have disassembled the dataset. 

445 records = self._table.fetch(dataset_id=ref.id) 

446 return [StoredFileInfo.from_record(record) for record in records] 

447 

448 def _register_datasets( 

449 self, 

450 refsAndInfos: Iterable[tuple[DatasetRef, StoredFileInfo]], 

451 insert_mode: DatabaseInsertMode = DatabaseInsertMode.INSERT, 

452 ) -> None: 

453 """Update registry to indicate that one or more datasets have been 

454 stored. 

455 

456 Parameters 

457 ---------- 

458 refsAndInfos : sequence of `tuple` [`DatasetRef`,

459 `StoredDatastoreItemInfo`] 

460 Datasets to register and the internal datastore metadata associated 

461 with them. 

462 insert_mode : `~lsst.daf.butler.registry.interfaces.DatabaseInsertMode`, optional

463 Indicate whether the new records should be new ("insert", default),

464 allowed to already exist ("ensure"), or replaced if already present

465 ("replace").

466 """ 

467 expandedRefs: list[DatasetRef] = [] 

468 expandedItemInfos: list[StoredFileInfo] = [] 

469 

470 for ref, itemInfo in refsAndInfos: 

471 expandedRefs.append(ref) 

472 expandedItemInfos.append(itemInfo) 

473 

474 # Dataset location only cares about registry ID so if we have 

475 # disassembled in datastore we have to deduplicate. Since they 

476 # will have different datasetTypes we can't use a set 

477 registryRefs = {r.id: r for r in expandedRefs} 

478 if insert_mode == DatabaseInsertMode.INSERT: 

479 self.bridge.insert(registryRefs.values()) 

480 else: 

481 # There are only two columns and all that matters is the 

482 # dataset ID. 

483 self.bridge.ensure(registryRefs.values()) 

484 self.addStoredItemInfo(expandedRefs, expandedItemInfos, insert_mode=insert_mode) 

485 

486 def _get_stored_records_associated_with_refs( 

487 self, refs: Iterable[DatasetIdRef], ignore_datastore_records: bool = False 

488 ) -> dict[DatasetId, list[StoredFileInfo]]: 

489 """Retrieve all records associated with the provided refs. 

490 

491 Parameters 

492 ---------- 

493 refs : iterable of `DatasetIdRef` 

494 The refs for which records are to be retrieved. 

495 ignore_datastore_records : `bool` 

496 If `True` then do not use datastore records stored in refs. 

497 

498 Returns 

499 ------- 

500 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

501 The matching records indexed by the ref ID. The number of entries 

502 in the dict can be smaller than the number of requested refs. 

503 """ 

504 # Check datastore records in refs first. 

505 records_by_ref: defaultdict[DatasetId, list[StoredFileInfo]] = defaultdict(list) 

506 refs_with_no_records = [] 

507 for ref in refs: 

508 if ignore_datastore_records or ref._datastore_records is None: 

509 refs_with_no_records.append(ref) 

510 else: 

511 if (ref_records := ref._datastore_records.get(self._table.name)) is not None: 

512 # Need to make sure they have correct type. 

513 for ref_record in ref_records: 

514 if not isinstance(ref_record, StoredFileInfo): 

515 raise TypeError( 

516 f"Datastore record has unexpected type {ref_record.__class__.__name__}" 

517 ) 

518 records_by_ref[ref.id].append(ref_record) 

519 

520 # If there were any refs without datastore records, check opaque table. 

521 records = self._table.fetch(dataset_id=[ref.id for ref in refs_with_no_records]) 

522 

523 # Uniqueness is dataset_id + component so can have multiple records 

524 # per ref. 

525 for record in records: 

526 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

527 return records_by_ref 

528 

529 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

530 """Return paths and associated dataset refs. 

531 

532 Parameters 

533 ---------- 

534 paths : `list` of `str` or `lsst.resources.ResourcePath` 

535 All the paths to include in search. 

536 

537 Returns 

538 ------- 

539 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

540 Mapping of each path to a set of associated database IDs. 

541 """ 

542 records = self._table.fetch(path=[str(path) for path in paths]) 

543 result = defaultdict(set) 

544 for row in records: 

545 result[row["path"]].add(row["dataset_id"]) 

546 return result 

547 

548 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

549 """Return all dataset refs associated with the supplied path. 

550 

551 Parameters 

552 ---------- 

553 pathInStore : `lsst.resources.ResourcePath` 

554 Path of interest in the data store. 

555 

556 Returns 

557 ------- 

558 ids : `set` of `DatasetId`

559 All `DatasetRef` IDs associated with this path. 

560 """ 

561 records = list(self._table.fetch(path=str(pathInStore))) 

562 ids = {r["dataset_id"] for r in records} 

563 return ids 

564 

565 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

566 """Remove information about the file associated with this dataset. 

567 

568 Parameters 

569 ---------- 

570 ref : `DatasetRef` 

571 The dataset that has been removed. 

572 """ 

573 # Note that this method is actually not used by this implementation, 

574 # we depend on bridge to delete opaque records. But there are some 

575 # tests that check that this method works, so we keep it for now. 

576 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

577 

578 def _get_dataset_locations_info( 

579 self, ref: DatasetIdRef, ignore_datastore_records: bool = False 

580 ) -> list[tuple[Location, StoredFileInfo]]: 

581 r"""Find all the `Location`\ s of the requested dataset in the 

582 `Datastore` and the associated stored file information. 

583 

584 Parameters 

585 ---------- 

586 ref : `DatasetRef` 

587 Reference to the required `Dataset`. 

588 ignore_datastore_records : `bool` 

589 If `True` then do not use datastore records stored in refs. 

590 

591 Returns 

592 ------- 

593 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

594 Location of the dataset within the datastore and 

595 stored information about each file and its formatter. 

596 """ 

597 # Get the file information (this will fail if no file) 

598 records = self.getStoredItemsInfo(ref, ignore_datastore_records) 

599 

600 # Use the path to determine the location -- we need to take 

601 # into account absolute URIs in the datastore record 

602 return [(r.file_location(self.locationFactory), r) for r in records] 

603 

604 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

605 """Check that there is only one dataset associated with the 

606 specified artifact. 

607 

608 Parameters 

609 ---------- 

610 ref : `DatasetRef` or `FakeDatasetRef` 

611 Dataset to be removed. 

612 location : `Location` 

613 The location of the artifact to be removed. 

614 

615 Returns 

616 ------- 

617 can_remove : `bool`

618 `True` if the artifact can be safely removed.

619 """ 

620 # Can't ever delete absolute URIs. 

621 if location.pathInStore.isabs(): 

622 return False 

623 

624 # Get all entries associated with this path 

625 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

626 if not allRefs: 

627 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

628 

629 # Remove these refs from all the refs and if there is nothing left 

630 # then we can delete 

631 remainingRefs = allRefs - {ref.id} 

632 

633 if remainingRefs: 

634 return False 

635 return True 

636 
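
# A standalone sketch (not from fileDatastore.py) of the reference-counting
# check performed by _can_remove_dataset_artifact above: a shared artifact
# may only be deleted once no other dataset IDs still point at it.
def can_remove(ids_using_artifact: set[str], id_being_removed: str) -> bool:
    if not ids_using_artifact:
        raise RuntimeError("Inconsistency: artifact has no registered datasets")
    remaining = ids_using_artifact - {id_being_removed}
    return not remaining


print(can_remove({"a"}, "a"))       # True: nothing else uses the artifact
print(can_remove({"a", "b"}, "a"))  # False: "b" still refers to it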

637 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

638 """Predict the location and related file information of the requested 

639 dataset in this datastore. 

640 

641 Parameters 

642 ---------- 

643 ref : `DatasetRef` 

644 Reference to the required `Dataset`. 

645 

646 Returns 

647 ------- 

648 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

649 Expected Location of the dataset within the datastore and 

650 placeholder information about each file and its formatter. 

651 

652 Notes 

653 ----- 

654 Uses the current configuration to determine how we would expect the 

655 datastore files to have been written if we couldn't ask registry. 

656 This is safe so long as there has been no change to datastore 

657 configuration between writing the dataset and wanting to read it. 

658 Will not work for files that have been ingested without using the 

659 standard file template or default formatter. 

660 """ 

661 # If we have a component ref we always need to ask the questions 

662 # of the composite. If the composite is disassembled this routine 

663 # should return all components. If the composite was not 

664 # disassembled the composite is what is stored regardless of 

665 # component request. Note that if the caller has disassembled 

666 # a composite there is no way for this guess to know that 

667 # without trying both the composite and component ref and seeing 

668 # if there is something at the component Location even without 

669 # disassembly being enabled. 

670 if ref.datasetType.isComponent(): 

671 ref = ref.makeCompositeRef() 

672 

673 # See if the ref is a composite that should be disassembled 

674 doDisassembly = self.composites.shouldBeDisassembled(ref) 

675 

676 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

677 

678 if doDisassembly: 

679 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

680 compRef = ref.makeComponentRef(component) 

681 location, formatter = self._determine_put_formatter_location(compRef) 

682 all_info.append((location, formatter, componentStorage, component)) 

683 

684 else: 

685 # Always use the composite ref if no disassembly 

686 location, formatter = self._determine_put_formatter_location(ref) 

687 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

688 

689 # Convert the list of tuples to have StoredFileInfo as second element 

690 return [ 

691 ( 

692 location, 

693 StoredFileInfo( 

694 formatter=formatter, 

695 path=location.pathInStore.path, 

696 storageClass=storageClass, 

697 component=component, 

698 checksum=None, 

699 file_size=-1, 

700 ), 

701 ) 

702 for location, formatter, storageClass, component in all_info 

703 ] 

704 

705 def _prepare_for_get( 

706 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

707 ) -> list[DatastoreFileGetInformation]: 

708 """Check parameters for ``get`` and obtain formatter and 

709 location. 

710 

711 Parameters 

712 ---------- 

713 ref : `DatasetRef` 

714 Reference to the required Dataset. 

715 parameters : `dict` 

716 `StorageClass`-specific parameters that specify, for example, 

717 a slice of the dataset to be loaded. 

718 

719 Returns 

720 ------- 

721 getInfo : `list` [`DatastoreFileGetInformation`] 

722 Parameters needed to retrieve each file. 

723 """ 

724 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

725 

726 # The storage class we want to use eventually 

727 refStorageClass = ref.datasetType.storageClass 

728 

729 # For trusted mode need to reset storage class. 

730 ref = self._cast_storage_class(ref) 

731 

732 # Get file metadata and internal metadata 

733 fileLocations = self._get_dataset_locations_info(ref) 

734 if not fileLocations: 

735 if not self.trustGetRequest: 

736 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

737 # Assume the dataset is where we think it should be 

738 fileLocations = self._get_expected_dataset_locations_info(ref) 

739 

740 if len(fileLocations) > 1: 

741 disassembled = True 

742 

743 # If trust is involved it is possible that there will be 

744 # components listed here that do not exist in the datastore. 

745 # Explicitly check for file artifact existence and filter out any 

746 # that are missing. 

747 if self.trustGetRequest: 

748 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

749 

750 # For now complain only if we have no components at all. One 

751 # component is probably a problem but we can punt that to the 

752 # assembler. 

753 if not fileLocations: 

754 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

755 

756 else: 

757 disassembled = False 

758 

759 # Is this a component request? 

760 refComponent = ref.datasetType.component() 

761 

762 fileGetInfo = [] 

763 for location, storedFileInfo in fileLocations: 

764 # The storage class used to write the file 

765 writeStorageClass = storedFileInfo.storageClass 

766 

767 # If this has been disassembled we need read to match the write 

768 if disassembled: 

769 readStorageClass = writeStorageClass 

770 else: 

771 readStorageClass = refStorageClass 

772 

773 formatter = get_instance_of( 

774 storedFileInfo.formatter, 

775 FileDescriptor( 

776 location, 

777 readStorageClass=readStorageClass, 

778 storageClass=writeStorageClass, 

779 parameters=parameters, 

780 ), 

781 ref.dataId, 

782 ) 

783 

784 formatterParams, notFormatterParams = formatter.segregateParameters() 

785 

786 # Of the remaining parameters, extract the ones supported by 

787 # this StorageClass (for components not all will be handled) 

788 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

789 

790 # The ref itself could be a component if the dataset was 

791 # disassembled by butler, or we disassembled in datastore and 

792 # components came from the datastore records 

793 component = storedFileInfo.component if storedFileInfo.component else refComponent 

794 

795 fileGetInfo.append( 

796 DatastoreFileGetInformation( 

797 location, 

798 formatter, 

799 storedFileInfo, 

800 assemblerParams, 

801 formatterParams, 

802 component, 

803 readStorageClass, 

804 ) 

805 ) 

806 

807 return fileGetInfo 

808 

809 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

810 """Check the arguments for ``put`` and obtain formatter and 

811 location. 

812 

813 Parameters 

814 ---------- 

815 inMemoryDataset : `object` 

816 The dataset to store. 

817 ref : `DatasetRef` 

818 Reference to the associated Dataset. 

819 

820 Returns 

821 ------- 

822 location : `Location` 

823 The location to write the dataset. 

824 formatter : `Formatter` 

825 The `Formatter` to use to write the dataset. 

826 

827 Raises 

828 ------ 

829 TypeError 

830 Supplied object and storage class are inconsistent. 

831 DatasetTypeNotSupportedError 

832 The associated `DatasetType` is not handled by this datastore. 

833 """ 

834 self._validate_put_parameters(inMemoryDataset, ref) 

835 return self._determine_put_formatter_location(ref) 

836 

837 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

838 """Calculate the formatter and output location to use for put. 

839 

840 Parameters 

841 ---------- 

842 ref : `DatasetRef` 

843 Reference to the associated Dataset. 

844 

845 Returns 

846 ------- 

847 location : `Location` 

848 The location to write the dataset. 

849 formatter : `Formatter` 

850 The `Formatter` to use to write the dataset. 

851 """ 

852 # Work out output file name 

853 try: 

854 template = self.templates.getTemplate(ref) 

855 except KeyError as e: 

856 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

857 

858 # Validate the template to protect against filenames from different 

859 # dataIds returning the same and causing overwrite confusion. 

860 template.validateTemplate(ref) 

861 

862 location = self.locationFactory.fromPath(template.format(ref)) 

863 

864 # Get the formatter based on the storage class 

865 storageClass = ref.datasetType.storageClass 

866 try: 

867 formatter = self.formatterFactory.getFormatter( 

868 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

869 ) 

870 except KeyError as e: 

871 raise DatasetTypeNotSupportedError( 

872 f"Unable to find formatter for {ref} in datastore {self.name}" 

873 ) from e 

874 

875 # Now that we know the formatter, update the location 

876 location = formatter.makeUpdatedLocation(location) 

877 

878 return location, formatter 

879 

880 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

881 # Docstring inherited from base class 

882 if transfer != "auto": 

883 return transfer 

884 

885 # See if the paths are within the datastore or not 

886 inside = [self._pathInStore(d.path) is not None for d in datasets] 

887 

888 if all(inside): 

889 transfer = None 

890 elif not any(inside): 

891 # Allow ResourcePath to use its own knowledge 

892 transfer = "auto" 

893 else: 

894 # This can happen when importing from a datastore that 

895 # has had some datasets ingested using "direct" mode. 

896 # Also allow ResourcePath to sort it out, but warn about it

897 # because the files outside the datastore will be referenced

898 # in place rather than copied (see the warning below).

899 log.warning( 

900 "Some datasets are inside the datastore and some are outside. Using 'split' " 

901 "transfer mode. This assumes that the files outside the datastore are " 

902 "still accessible to the new butler since they will not be copied into " 

903 "the target datastore." 

904 ) 

905 transfer = "split" 

906 

907 return transfer 

908 
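
# A standalone sketch (not from fileDatastore.py) of the "auto" transfer-mode
# resolution in _overrideTransferMode above: every path inside the datastore
# means no transfer is needed, none inside defers to ResourcePath ("auto"),
# and a mixture falls back to "split".
def resolve_auto_transfer(inside: list[bool]) -> str | None:
    if all(inside):
        return None    # files are already inside the datastore
    if not any(inside):
        return "auto"  # let ResourcePath choose an appropriate transfer
    return "split"     # inside files used in place, outside files referenced directly


print(resolve_auto_transfer([True, True]))    # None
print(resolve_auto_transfer([False, False]))  # auto
print(resolve_auto_transfer([True, False]))   # split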

909 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

910 """Return path relative to datastore root. 

911 

912 Parameters 

913 ---------- 

914 path : `lsst.resources.ResourcePathExpression` 

915 Path to dataset. Can be an absolute URI. If relative, it is

916 assumed to be relative to the datastore. Returns the path in

917 the datastore or `None` if the path is outside it.

918 

919 Returns 

920 ------- 

921 inStore : `str` or `None`

922 Path relative to datastore root. Returns `None` if the file is 

923 outside the root. 

924 """ 

925 # Relative path will always be relative to datastore 

926 pathUri = ResourcePath(path, forceAbsolute=False) 

927 return pathUri.relative_to(self.root) 

928 
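
# A pure-pathlib sketch (not from fileDatastore.py) of the behaviour documented
# for _pathInStore above: return the path relative to the datastore root, or
# None when the path falls outside it. The real implementation uses
# lsst.resources.ResourcePath.relative_to(), which also handles remote URIs.
from pathlib import PurePosixPath


def path_in_store(path: str, root: str) -> str | None:
    candidate = PurePosixPath(path)
    if not candidate.is_absolute():
        return str(candidate)  # relative paths are already datastore-relative
    try:
        return str(candidate.relative_to(PurePosixPath(root)))
    except ValueError:
        return None  # outside the datastore root


print(path_in_store("/repo/datastore/a/b.fits", "/repo/datastore"))  # a/b.fits
print(path_in_store("/somewhere/else/b.fits", "/repo/datastore"))    # None
print(path_in_store("a/b.fits", "/repo/datastore"))                  # a/b.fits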

929 def _standardizeIngestPath( 

930 self, path: str | ResourcePath, *, transfer: str | None = None 

931 ) -> str | ResourcePath: 

932 """Standardize the path of a to-be-ingested file. 

933 

934 Parameters 

935 ---------- 

936 path : `str` or `lsst.resources.ResourcePath` 

937 Path of a file to be ingested. This parameter is not expected 

938 to accept all of the types that can be used to construct a

939 `~lsst.resources.ResourcePath`. 

940 transfer : `str`, optional 

941 How (and whether) the dataset should be added to the datastore. 

942 See `ingest` for details of transfer modes. 

943 This implementation is provided only so 

944 `NotImplementedError` can be raised if the mode is not supported; 

945 actual transfers are deferred to `_extractIngestInfo`. 

946 

947 Returns 

948 ------- 

949 path : `str` or `lsst.resources.ResourcePath` 

950 New path in what the datastore considers standard form. If an 

951 absolute URI was given, it will be returned unchanged.

952 

953 Notes 

954 ----- 

955 Subclasses of `FileDatastore` can implement this method instead 

956 of `_prepIngest`. It should not modify the data repository or given 

957 file in any way. 

958 

959 Raises 

960 ------ 

961 NotImplementedError 

962 Raised if the datastore does not support the given transfer mode 

963 (including the case where ingest is not supported at all). 

964 FileNotFoundError 

965 Raised if one of the given files does not exist. 

966 """ 

967 if transfer not in (None, "direct", "split") + self.root.transferModes: 

968 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

969 

970 # A relative URI indicates relative to datastore root 

971 srcUri = ResourcePath(path, forceAbsolute=False) 

972 if not srcUri.isabs(): 

973 srcUri = self.root.join(path) 

974 

975 if not srcUri.exists(): 

976 raise FileNotFoundError( 

977 f"Resource at {srcUri} does not exist; note that paths to ingest " 

978 f"are assumed to be relative to {self.root} unless they are absolute." 

979 ) 

980 

981 if transfer is None: 

982 relpath = srcUri.relative_to(self.root) 

983 if not relpath: 

984 raise RuntimeError( 

985 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

986 ) 

987 

988 # Return the relative path within the datastore for internal 

989 # transfer 

990 path = relpath 

991 

992 return path 

993 

994 def _extractIngestInfo( 

995 self, 

996 path: ResourcePathExpression, 

997 ref: DatasetRef, 

998 *, 

999 formatter: Formatter | type[Formatter], 

1000 transfer: str | None = None, 

1001 record_validation_info: bool = True, 

1002 ) -> StoredFileInfo: 

1003 """Relocate (if necessary) and extract `StoredFileInfo` from a 

1004 to-be-ingested file. 

1005 

1006 Parameters 

1007 ---------- 

1008 path : `lsst.resources.ResourcePathExpression` 

1009 URI or path of a file to be ingested. 

1010 ref : `DatasetRef` 

1011 Reference for the dataset being ingested. Guaranteed to have 

1012 ``dataset_id`` not `None`.

1013 formatter : `type` or `Formatter` 

1014 `Formatter` subclass to use for this dataset or an instance. 

1015 transfer : `str`, optional 

1016 How (and whether) the dataset should be added to the datastore. 

1017 See `ingest` for details of transfer modes. 

1018 record_validation_info : `bool`, optional 

1019 If `True`, the default, the datastore can record validation 

1020 information associated with the file. If `False` the datastore 

1021 will not attempt to track any information such as checksums 

1022 or file sizes. This can be useful if such information is tracked 

1023 in an external system or if the file is to be compressed in place. 

1024 It is up to the datastore whether this parameter is relevant. 

1025 

1026 Returns 

1027 ------- 

1028 info : `StoredFileInfo` 

1029 Internal datastore record for this file. This will be inserted by 

1030 the caller; the `_extractIngestInfo` is only responsible for 

1031 creating and populating the struct. 

1032 

1033 Raises 

1034 ------ 

1035 FileNotFoundError 

1036 Raised if one of the given files does not exist. 

1037 FileExistsError 

1038 Raised if transfer is not `None` but the (internal) location the 

1039 file would be moved to is already occupied. 

1040 """ 

1041 if self._transaction is None: 

1042 raise RuntimeError("Ingest called without transaction enabled") 

1043 

1044 # Create URI of the source path, do not need to force a relative 

1045 # path to absolute. 

1046 srcUri = ResourcePath(path, forceAbsolute=False) 

1047 

1048 # Track whether we have read the size of the source yet 

1049 have_sized = False 

1050 

1051 tgtLocation: Location | None 

1052 if transfer is None or transfer == "split": 

1053 # A relative path is assumed to be relative to the datastore 

1054 # in this context 

1055 if not srcUri.isabs(): 

1056 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

1057 else: 

1058 # Work out the path in the datastore from an absolute URI 

1059 # This is required to be within the datastore. 

1060 pathInStore = srcUri.relative_to(self.root) 

1061 if pathInStore is None and transfer is None: 

1062 raise RuntimeError( 

1063 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

1064 ) 

1065 if pathInStore: 

1066 tgtLocation = self.locationFactory.fromPath(pathInStore) 

1067 elif transfer == "split": 

1068 # Outside the datastore but treat that as a direct ingest 

1069 # instead. 

1070 tgtLocation = None 

1071 else: 

1072 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

1073 elif transfer == "direct": 

1074 # Want to store the full URI to the resource directly in 

1075 # datastore. This is useful for referring to permanent archive 

1076 # storage for raw data. 

1077 # Trust that people know what they are doing. 

1078 tgtLocation = None 

1079 else: 

1080 # Work out the name we want this ingested file to have 

1081 # inside the datastore 

1082 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

1083 if not tgtLocation.uri.dirname().exists(): 

1084 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

1085 tgtLocation.uri.dirname().mkdir() 

1086 

1087 # if we are transferring from a local file to a remote location 

1088 # it may be more efficient to get the size and checksum of the 

1089 # local file rather than the transferred one 

1090 if record_validation_info and srcUri.isLocal: 

1091 size = srcUri.size() 

1092 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

1093 have_sized = True 

1094 

1095 # Transfer the resource to the destination. 

1096 # Allow overwrite of an existing file. This matches the behavior 

1097 # of datastore.put() in that it trusts that registry would not 

1098 # be asking to overwrite unless registry thought that the 

1099 # overwrite was allowed. 

1100 tgtLocation.uri.transfer_from( 

1101 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

1102 ) 

1103 

1104 if tgtLocation is None: 

1105 # This means we are using direct mode 

1106 targetUri = srcUri 

1107 targetPath = str(srcUri) 

1108 else: 

1109 targetUri = tgtLocation.uri 

1110 targetPath = tgtLocation.pathInStore.path 

1111 

1112 # the file should exist in the datastore now 

1113 if record_validation_info: 

1114 if not have_sized: 

1115 size = targetUri.size() 

1116 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

1117 else: 

1118 # Not recording any file information. 

1119 size = -1 

1120 checksum = None 

1121 

1122 return StoredFileInfo( 

1123 formatter=formatter, 

1124 path=targetPath, 

1125 storageClass=ref.datasetType.storageClass, 

1126 component=ref.datasetType.component(), 

1127 file_size=size, 

1128 checksum=checksum, 

1129 ) 

1130 

1131 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

1132 # Docstring inherited from Datastore._prepIngest. 

1133 filtered = [] 

1134 for dataset in datasets: 

1135 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1136 if not acceptable: 

1137 continue 

1138 else: 

1139 dataset.refs = acceptable 

1140 if dataset.formatter is None: 

1141 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1142 else: 

1143 assert isinstance(dataset.formatter, type | str) 

1144 formatter_class = get_class_of(dataset.formatter) 

1145 if not issubclass(formatter_class, Formatter): 

1146 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1147 dataset.formatter = formatter_class 

1148 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1149 filtered.append(dataset) 

1150 return _IngestPrepData(filtered) 

1151 

1152 @transactional 

1153 def _finishIngest( 

1154 self, 

1155 prepData: Datastore.IngestPrepData, 

1156 *, 

1157 transfer: str | None = None, 

1158 record_validation_info: bool = True, 

1159 ) -> None: 

1160 # Docstring inherited from Datastore._finishIngest. 

1161 refsAndInfos = [] 

1162 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1163 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1164 # Do ingest as if the first dataset ref is associated with the file 

1165 info = self._extractIngestInfo( 

1166 dataset.path, 

1167 dataset.refs[0], 

1168 formatter=dataset.formatter, 

1169 transfer=transfer, 

1170 record_validation_info=record_validation_info, 

1171 ) 

1172 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1173 

1174 # In direct mode we can allow repeated ingests of the same thing 

1175 # if we are sure that the external dataset is immutable. We use 

1176 # UUIDv5 to indicate this. If there is a mix of v4 and v5 they are 

1177 # separated. 

1178 refs_and_infos_replace = [] 

1179 refs_and_infos_insert = [] 

1180 if transfer == "direct": 

1181 for entry in refsAndInfos: 

1182 if entry[0].id.version == 5: 

1183 refs_and_infos_replace.append(entry) 

1184 else: 

1185 refs_and_infos_insert.append(entry) 

1186 else: 

1187 refs_and_infos_insert = refsAndInfos 

1188 

1189 if refs_and_infos_insert: 

1190 self._register_datasets(refs_and_infos_insert, insert_mode=DatabaseInsertMode.INSERT) 

1191 if refs_and_infos_replace: 

1192 self._register_datasets(refs_and_infos_replace, insert_mode=DatabaseInsertMode.REPLACE) 

1193 
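
# A standalone illustration (not from fileDatastore.py) of the UUID-version
# test used in _finishIngest above: deterministic v5 dataset IDs make repeated
# "direct" ingests safe to REPLACE, while random v4 IDs must be plain INSERTs.
import uuid

namespace = uuid.uuid4()  # placeholder namespace for the illustration

deterministic_id = uuid.uuid5(namespace, "instrument=X/exposure=123")  # version 5
random_id = uuid.uuid4()                                               # version 4

for dataset_id in (deterministic_id, random_id):
    mode = "replace" if dataset_id.version == 5 else "insert"
    print(dataset_id.version, mode)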

1194 def _calculate_ingested_datastore_name( 

1195 self, 

1196 srcUri: ResourcePath, 

1197 ref: DatasetRef, 

1198 formatter: Formatter | type[Formatter] | None = None, 

1199 ) -> Location: 

1200 """Given a source URI and a DatasetRef, determine the name the 

1201 dataset will have inside datastore. 

1202 

1203 Parameters 

1204 ---------- 

1205 srcUri : `lsst.resources.ResourcePath` 

1206 URI to the source dataset file. 

1207 ref : `DatasetRef` 

1208 Ref associated with the newly-ingested dataset artifact. This 

1209 is used to determine the name within the datastore. 

1210 formatter : `Formatter` or `Formatter` class, optional

1211 Formatter to use for validation. Can be a class or an instance. 

1212 No validation of the file extension is performed if the 

1213 ``formatter`` is `None`. This can be used if the caller knows 

1214 that the source URI and target URI will use the same formatter. 

1215 

1216 Returns 

1217 ------- 

1218 location : `Location` 

1219 Target location for the newly-ingested dataset. 

1220 """ 

1221 # Ingesting a file from outside the datastore. 

1222 # This involves a new name. 

1223 template = self.templates.getTemplate(ref) 

1224 location = self.locationFactory.fromPath(template.format(ref)) 

1225 

1226 # Get the extension 

1227 ext = srcUri.getExtension() 

1228 

1229 # Update the destination to include that extension 

1230 location.updateExtension(ext) 

1231 

1232 # Ask the formatter to validate this extension 

1233 if formatter is not None: 

1234 formatter.validateExtension(location) 

1235 

1236 return location 

1237 

1238 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1239 """Write out in memory dataset to datastore. 

1240 

1241 Parameters 

1242 ---------- 

1243 inMemoryDataset : `object` 

1244 Dataset to write to datastore. 

1245 ref : `DatasetRef` 

1246 Registry information associated with this dataset. 

1247 

1248 Returns 

1249 ------- 

1250 info : `StoredFileInfo` 

1251 Information describing the artifact written to the datastore. 

1252 """ 

1253 # May need to coerce the in memory dataset to the correct 

1254 # python type, but first we need to make sure the storage class 

1255 # reflects the one defined in the data repository. 

1256 ref = self._cast_storage_class(ref) 

1257 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1258 

1259 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1260 uri = location.uri 

1261 

1262 if not uri.dirname().exists(): 

1263 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1264 uri.dirname().mkdir() 

1265 

1266 if self._transaction is None: 

1267 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1268 

1269 def _removeFileExists(uri: ResourcePath) -> None: 

1270 """Remove a file and do not complain if it is not there. 

1271 

1272 This is important since a formatter might fail before the file 

1273 is written and we should not confuse people by writing spurious 

1274 error messages to the log. 

1275 """ 

1276 with contextlib.suppress(FileNotFoundError): 

1277 uri.remove() 

1278 

1279 # Register a callback to try to delete the uploaded data if 

1280 # something fails below 

1281 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1282 

1283 data_written = False 

1284 

1285 # For remote URIs some datasets can be serialized directly 

1286 # to bytes and sent to the remote datastore without writing a 

1287 # file. If the dataset is intended to be saved to the cache 

1288 # a file is always written and direct write to the remote 

1289 # datastore is bypassed. 

1290 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1291 # Remote URI that is not cached so can write directly. 

1292 try: 

1293 serializedDataset = formatter.toBytes(inMemoryDataset) 

1294 except NotImplementedError: 

1295 # Fallback to the file writing option. 

1296 pass 

1297 except Exception as e: 

1298 raise RuntimeError( 

1299 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1300 ) from e 

1301 else: 

1302 log.debug("Writing bytes directly to %s", uri) 

1303 uri.write(serializedDataset, overwrite=True) 

1304 log.debug("Successfully wrote bytes directly to %s", uri) 

1305 data_written = True 

1306 

1307 if not data_written: 

1308 # Did not write the bytes directly to object store so instead 

1309 # write to temporary file. Always write to a temporary even if 

1310 # using a local file system -- that gives us atomic writes. 

1311 # If a process is killed as the file is being written we do not 

1312 # want it to remain in the correct place but in corrupt state. 

1313 # For local files write to the output directory not temporary dir. 

1314 prefix = uri.dirname() if uri.isLocal else None 

1315 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1316 # Need to configure the formatter to write to a different 

1317 # location and that needs us to overwrite internals 

1318 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1319 with formatter._updateLocation(Location(None, temporary_uri)): 

1320 try: 

1321 formatter.write(inMemoryDataset) 

1322 except Exception as e: 

1323 raise RuntimeError( 

1324 f"Failed to serialize dataset {ref} of type" 

1325 f" {type(inMemoryDataset)} to " 

1326 f"temporary location {temporary_uri}" 

1327 ) from e 

1328 

1329 # Use move for a local file since that becomes an efficient 

1330 # os.rename. For remote resources we use copy to allow the 

1331 # file to be cached afterwards. 

1332 transfer = "move" if uri.isLocal else "copy" 

1333 

1334 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1335 

1336 if transfer == "copy": 

1337 # Cache if required 

1338 self.cacheManager.move_to_cache(temporary_uri, ref) 

1339 

1340 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1341 

1342 # URI is needed to resolve what ingest case we are dealing with

1343 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1344 
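
# A local-filesystem sketch (not from fileDatastore.py) of the
# write-to-temporary-then-move pattern used above, which prevents a partially
# written file from ever appearing at the final location. The real code does
# this with ResourcePath.temporary_uri() and transfer_from(); this helper and
# its names are hypothetical.
import os
import tempfile


def atomic_write(data: bytes, destination: str) -> None:
    directory = os.path.dirname(destination) or "."
    fd, tmp_path = tempfile.mkstemp(dir=directory)  # same directory => same filesystem
    try:
        with os.fdopen(fd, "wb") as stream:
            stream.write(data)
        os.replace(tmp_path, destination)  # atomic rename on POSIX
    except BaseException:
        os.unlink(tmp_path)  # remove the temporary file on failure
        raise


with tempfile.TemporaryDirectory() as tmpdir:
    target = os.path.join(tmpdir, "dataset.bin")
    atomic_write(b"payload", target)
    print(os.path.getsize(target))  # 7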

1345 def _read_artifact_into_memory( 

1346 self, 

1347 getInfo: DatastoreFileGetInformation, 

1348 ref: DatasetRef, 

1349 isComponent: bool = False, 

1350 cache_ref: DatasetRef | None = None, 

1351 ) -> Any: 

1352 """Read the artifact from datastore into in memory object. 

1353 

1354 Parameters 

1355 ---------- 

1356 getInfo : `DatastoreFileGetInformation` 

1357 Information about the artifact within the datastore. 

1358 ref : `DatasetRef` 

1359 The registry information associated with this artifact. 

1360 isComponent : `bool` 

1361 Flag to indicate if a component is being read from this artifact. 

1362 cache_ref : `DatasetRef`, optional 

1363 The DatasetRef to use when looking up the file in the cache. 

1364 This ref must have the same ID as the supplied ref but can 

1365 be a parent ref or component ref to indicate to the cache whether 

1366 a composite file is being requested from the cache or a component 

1367 file. Without this the cache will default to the supplied ref but 

1368 it can get confused with read-only derived components for 

1369 disassembled composites. 

1370 

1371 Returns 

1372 ------- 

1373 inMemoryDataset : `object` 

1374 The artifact as a python object. 

1375 """ 

1376 location = getInfo.location 

1377 uri = location.uri 

1378 log.debug("Accessing data from %s", uri) 

1379 

1380 if cache_ref is None: 

1381 cache_ref = ref 

1382 if cache_ref.id != ref.id: 

1383 raise ValueError( 

1384 "The supplied cache dataset ref refers to a different dataset than expected:" 

1385 f" {ref.id} != {cache_ref.id}" 

1386 ) 

1387 

1388 # Cannot recalculate checksum but can compare size as a quick check 

1389 # Do not do this if the size is negative since that indicates 

1390 # we do not know. 

1391 recorded_size = getInfo.info.file_size 

1392 resource_size = uri.size() 

1393 if recorded_size >= 0 and resource_size != recorded_size: 

1394 raise RuntimeError( 

1395 "Integrity failure in Datastore. " 

1396 f"Size of file {uri} ({resource_size}) " 

1397 f"does not match size recorded in registry of {recorded_size}" 

1398 ) 

1399 

1400 # For the general case we have choices for how to proceed. 

1401 # 1. Always use a local file (downloading the remote resource to a 

1402 # temporary file if needed). 

1403 # 2. Use a threshold size and read into memory and use bytes. 

1404 # Use both for now with an arbitrary hand off size. 

1405 # This allows small datasets to be downloaded from remote object 

1406 # stores without requiring a temporary file. 

1407 

1408 formatter = getInfo.formatter 

1409 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1410 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1411 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1412 if cached_file is not None: 

1413 desired_uri = cached_file 

1414 msg = f" (cached version of {uri})" 

1415 else: 

1416 desired_uri = uri 

1417 msg = "" 

1418 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1419 serializedDataset = desired_uri.read() 

1420 log.debug( 

1421 "Deserializing %s from %d bytes from location %s with formatter %s", 

1422 f"component {getInfo.component}" if isComponent else "", 

1423 len(serializedDataset), 

1424 uri, 

1425 formatter.name(), 

1426 ) 

1427 try: 

1428 result = formatter.fromBytes( 

1429 serializedDataset, component=getInfo.component if isComponent else None 

1430 ) 

1431 except Exception as e: 

1432 raise ValueError( 

1433 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1434 f" ({ref.datasetType.name} from {uri}): {e}" 

1435 ) from e 

1436 else: 

1437 # Read from file. 

1438 

1439 # Have to update the Location associated with the formatter 

1440 # because formatter.read does not allow an override. 

1441 # This could be improved. 

1442 location_updated = False 

1443 msg = "" 

1444 

1445 # First check in cache for local version. 

1446 # The cache will only be relevant for remote resources but 

1447 # no harm in always asking. Context manager ensures that cache 

1448 # file is not deleted during cache expiration. 

1449 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1450 if cached_file is not None: 

1451 msg = f"(via cache read of remote file {uri})" 

1452 uri = cached_file 

1453 location_updated = True 

1454 

1455 with uri.as_local() as local_uri: 

1456 can_be_cached = False 

1457 if uri != local_uri: 

1458 # URI was remote and file was downloaded 

1459 cache_msg = "" 

1460 location_updated = True 

1461 

1462 if self.cacheManager.should_be_cached(cache_ref): 

1463 # In this scenario we want to ask if the downloaded 

1464 # file should be cached but we should not cache 

1465 # it until after we've used it (to ensure it can't 

1466 # be expired whilst we are using it). 

1467 can_be_cached = True 

1468 

1469 # Say that it is "likely" to be cached because 

1470 # if the formatter read fails we will not be 

1471 # caching this file. 

1472 cache_msg = " and likely cached" 

1473 

1474 msg = f"(via download to local file{cache_msg})" 

1475 

1476 # Calculate the (possibly) new location for the formatter 

1477 # to use. 

1478 newLocation = Location(*local_uri.split()) if location_updated else None 

1479 

1480 log.debug( 

1481 "Reading%s from location %s %s with formatter %s", 

1482 f" component {getInfo.component}" if isComponent else "", 

1483 uri, 

1484 msg, 

1485 formatter.name(), 

1486 ) 

1487 try: 

1488 with ( 

1489 formatter._updateLocation(newLocation), 

1490 time_this( 

1491 log, 

1492 msg="Reading%s from location %s %s with formatter %s", 

1493 args=( 

1494 f" component {getInfo.component}" if isComponent else "", 

1495 uri, 

1496 msg, 

1497 formatter.name(), 

1498 ), 

1499 ), 

1500 ): 

1501 result = formatter.read(component=getInfo.component if isComponent else None) 

1502 except Exception as e: 

1503 raise ValueError( 

1504 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1505 f" ({ref.datasetType.name} from {uri}): {e}" 

1506 ) from e 

1507 

1508 # File was read successfully so can move to cache 

1509 if can_be_cached: 

1510 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1511 

1512 return self._post_process_get( 

1513 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1514 ) 

1515 
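
A minimal sketch of the size handoff described in the comments above, using only lsst.resources calls already seen in this method (size(), read(), as_local()); the helper name, the two callables, and the default threshold are illustrative assumptions rather than datastore API.

from collections.abc import Callable
from typing import Any

from lsst.resources import ResourcePath


def read_small_or_local(
    uri: ResourcePath,
    process_bytes: Callable[[bytes], Any],
    process_file: Callable[[ResourcePath], Any],
    nbytes_max: int = 10_000_000,  # Arbitrary hand-off size that can be tuned.
) -> Any:
    """Read small resources as bytes and larger ones via a local file."""
    if uri.size() <= nbytes_max:
        # Small enough to pull straight from the (possibly remote) resource
        # without creating a temporary file.
        return process_bytes(uri.read())
    # Larger artifact: as_local() downloads remote resources to a temporary
    # local file (a no-op for file URIs) before handing the path on.
    with uri.as_local() as local_uri:
        return process_file(local_uri)
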

1516 def knows(self, ref: DatasetRef) -> bool: 

1517 """Check if the dataset is known to the datastore. 

1518 

1519 Does not check for existence of any artifact. 

1520 

1521 Parameters 

1522 ---------- 

1523 ref : `DatasetRef` 

1524 Reference to the required dataset. 

1525 

1526 Returns 

1527 ------- 

1528 exists : `bool` 

1529 `True` if the dataset is known to the datastore. 

1530 """ 

1531 # We cannot trust datastore records from ref, as many unit tests delete 

1532 # datasets and check their existence. 

1533 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1534 if fileLocations: 

1535 return True 

1536 return False 

1537 

1538 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1539 # Docstring inherited from the base class. 

1540 

1541 # The records themselves. Could be missing some entries. 

1542 records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

1543 

1544 return {ref: ref.id in records for ref in refs} 

1545 

1546 def _process_mexists_records( 

1547 self, 

1548 id_to_ref: dict[DatasetId, DatasetRef], 

1549 records: dict[DatasetId, list[StoredFileInfo]], 

1550 all_required: bool, 

1551 artifact_existence: dict[ResourcePath, bool] | None = None, 

1552 ) -> dict[DatasetRef, bool]: 

1553 """Check given records for existence. 

1554 

1555 Helper function for `mexists()`. 

1556 

1557 Parameters 

1558 ---------- 

1559 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1560 Mapping of the dataset ID to the dataset ref itself. 

1561 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1562 Records as generally returned by 

1563 ``_get_stored_records_associated_with_refs``. 

1564 all_required : `bool` 

1565 If `True`, every artifact associated with a dataset ID must exist 

1566 for the dataset to be reported as existing; if `False`, any one artifact suffices. 

1567 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1568 Optional mapping of datastore artifact to existence. Updated by 

1569 this method with details of all artifacts tested. Can be `None` 

1570 if the caller is not interested. 

1571 

1572 Returns 

1573 ------- 

1574 existence : `dict` of [`DatasetRef`, `bool`] 

1575 Mapping from dataset to boolean indicating existence. 

1576 """ 

1577 # The URIs to be checked and a mapping of those URIs to 

1578 # the dataset ID. 

1579 uris_to_check: list[ResourcePath] = [] 

1580 location_map: dict[ResourcePath, DatasetId] = {} 

1581 

1582 location_factory = self.locationFactory 

1583 

1584 uri_existence: dict[ResourcePath, bool] = {} 

1585 for ref_id, infos in records.items(): 

1586 # Key is the dataset Id, value is list of StoredItemInfo 

1587 uris = [info.file_location(location_factory).uri for info in infos] 

1588 location_map.update({uri: ref_id for uri in uris}) 

1589 

1590 # Check the local cache directly for a dataset corresponding 

1591 # to the remote URI. 

1592 if self.cacheManager.file_count > 0: 

1593 ref = id_to_ref[ref_id] 

1594 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1595 check_ref = ref 

1596 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1597 check_ref = ref.makeComponentRef(component) 

1598 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1599 # Proxy for URI existence. 

1600 uri_existence[uri] = True 

1601 else: 

1602 uris_to_check.append(uri) 

1603 else: 

1604 # Check all of them. 

1605 uris_to_check.extend(uris) 

1606 

1607 if artifact_existence is not None: 

1608 # If a URI has already been checked remove it from the list 

1609 # and immediately add the status to the output dict. 

1610 filtered_uris_to_check = [] 

1611 for uri in uris_to_check: 

1612 if uri in artifact_existence: 

1613 uri_existence[uri] = artifact_existence[uri] 

1614 else: 

1615 filtered_uris_to_check.append(uri) 

1616 uris_to_check = filtered_uris_to_check 

1617 

1618 # Results. 

1619 dataset_existence: dict[DatasetRef, bool] = {} 

1620 

1621 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1622 for uri, exists in uri_existence.items(): 

1623 dataset_id = location_map[uri] 

1624 ref = id_to_ref[dataset_id] 

1625 

1626 # Disassembled composite needs to check all locations. 

1627 # all_required indicates whether all need to exist or not. 

1628 if ref in dataset_existence: 

1629 if all_required: 

1630 exists = dataset_existence[ref] and exists 

1631 else: 

1632 exists = dataset_existence[ref] or exists 

1633 dataset_existence[ref] = exists 

1634 

1635 if artifact_existence is not None: 

1636 artifact_existence.update(uri_existence) 

1637 

1638 return dataset_existence 

1639 
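
A simplified sketch of the roll-up performed above, assuming the caller already has the artifact URIs for each dataset: one bulk ResourcePath.mexists() call, then per-dataset combination with all() or any() depending on all_required. The uri_map argument is a hypothetical stand-in for the datastore records.

from collections.abc import Mapping, Sequence

from lsst.resources import ResourcePath


def rollup_existence(
    uri_map: Mapping[str, Sequence[ResourcePath]],  # dataset label -> its artifact URIs
    all_required: bool = True,
) -> dict[str, bool]:
    """Combine artifact-level existence into dataset-level existence."""
    # One bulk existence query covering every artifact of interest.
    uri_exists = ResourcePath.mexists([uri for uris in uri_map.values() for uri in uris])

    # Disassembled composites have several artifacts per dataset, so the
    # per-URI answers must be combined; all_required selects the semantics.
    combine = all if all_required else any
    return {label: combine(uri_exists[uri] for uri in uris) for label, uris in uri_map.items()}
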

1640 def mexists( 

1641 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1642 ) -> dict[DatasetRef, bool]: 

1643 """Check the existence of multiple datasets at once. 

1644 

1645 Parameters 

1646 ---------- 

1647 refs : iterable of `DatasetRef` 

1648 The datasets to be checked. 

1649 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1650 Optional mapping of datastore artifact to existence. Updated by 

1651 this method with details of all artifacts tested. Can be `None` 

1652 if the caller is not interested. 

1653 

1654 Returns 

1655 ------- 

1656 existence : `dict` of [`DatasetRef`, `bool`] 

1657 Mapping from dataset to boolean indicating existence. 

1658 

1659 Notes 

1660 ----- 

1661 To minimize potentially costly remote existence checks, the local 

1662 cache is checked as a proxy for existence. If a file for this 

1663 `DatasetRef` does exist no check is done for the actual URI. This 

1664 could result in possibly unexpected behavior if the dataset itself 

1665 has been removed from the datastore by another process whilst it is 

1666 still in the cache. 

1667 """ 

1668 chunk_size = 10_000 

1669 dataset_existence: dict[DatasetRef, bool] = {} 

1670 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1671 n_found_total = 0 

1672 n_checked = 0 

1673 n_chunks = 0 

1674 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1675 chunk_result = self._mexists(chunk, artifact_existence) 

1676 

1677 # The log message level and content depend on how many 

1678 # datasets we are processing. 

1679 n_results = len(chunk_result) 

1680 

1681 # Use verbose logging to ensure that messages can be seen 

1682 # easily if many refs are being checked. 

1683 log_threshold = VERBOSE 

1684 n_checked += n_results 

1685 

1686 # This sum can take some time so only do it if we know the 

1687 # result is going to be used. 

1688 n_found = 0 

1689 if log.isEnabledFor(log_threshold): 

1690 # Can treat the booleans as 0, 1 integers and sum them. 

1691 n_found = sum(chunk_result.values()) 

1692 n_found_total += n_found 

1693 

1694 # We are deliberately not trying to count the number of refs 

1695 # provided in case it's in the millions. This means there is a 

1696 # situation where the number of refs exactly matches the chunk 

1697 # size and we will switch to the multi-chunk path even though 

1698 # we only have a single chunk. 

1699 if n_results < chunk_size and n_chunks == 0: 

1700 # Single chunk will be processed so we can provide more detail. 

1701 if n_results == 1: 

1702 ref = list(chunk_result)[0] 

1703 # Use debug logging to be consistent with `exists()`. 

1704 log.debug( 

1705 "Calling mexists() with single ref that does%s exist (%s).", 

1706 "" if chunk_result[ref] else " not", 

1707 ref, 

1708 ) 

1709 else: 

1710 # Single chunk but multiple files. Summarize. 

1711 log.log( 

1712 log_threshold, 

1713 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1714 n_found, 

1715 n_checked, 

1716 ) 

1717 

1718 else: 

1719 # Use incremental verbose logging when we have multiple chunks. 

1720 log.log( 

1721 log_threshold, 

1722 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1723 "(running total from all chunks so far: %d found out of %d checked)", 

1724 n_chunks, 

1725 n_found, 

1726 n_results, 

1727 n_found_total, 

1728 n_checked, 

1729 ) 

1730 dataset_existence.update(chunk_result) 

1731 n_chunks += 1 

1732 

1733 return dataset_existence 

1734 
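
A usage sketch, assuming an existing FileDatastore instance and resolved refs; the helper name is illustrative. Passing an artifact_existence dict lets later calls reuse the per-URI results recorded here.

from lsst.daf.butler import DatasetRef
from lsst.resources import ResourcePath


def report_missing(datastore, refs: list[DatasetRef]) -> list[DatasetRef]:
    """Return the refs whose artifacts are not present in the datastore."""
    # datastore is assumed to be a FileDatastore (or any Datastore with the
    # same mexists signature).
    artifact_existence: dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence)
    # artifact_existence now holds every URI that was actually checked and
    # can be handed to subsequent calls to avoid repeating remote lookups.
    return [ref for ref, exists in existence.items() if not exists]
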

1735 def _mexists( 

1736 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1737 ) -> dict[DatasetRef, bool]: 

1738 """Check the existence of multiple datasets at once. 

1739 

1740 Parameters 

1741 ---------- 

1742 refs : iterable of `DatasetRef` 

1743 The datasets to be checked. 

1744 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1745 Optional mapping of datastore artifact to existence. Updated by 

1746 this method with details of all artifacts tested. Can be `None` 

1747 if the caller is not interested. 

1748 

1749 Returns 

1750 ------- 

1751 existence : `dict` of [`DatasetRef`, `bool`] 

1752 Mapping from dataset to boolean indicating existence. 

1753 """ 

1754 # Make a mapping from refs with the internal storage class to the given 

1755 # refs that may have a different one. We'll use the internal refs 

1756 # throughout this method and convert back at the very end. 

1757 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1758 

1759 # Need a mapping of dataset_id to (internal) dataset ref since some 

1760 # internal APIs work with dataset_id. 

1761 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1762 

1763 # Set of all IDs we are checking for. 

1764 requested_ids = set(id_to_ref.keys()) 

1765 

1766 # The records themselves. Could be missing some entries. 

1767 records = self._get_stored_records_associated_with_refs( 

1768 id_to_ref.values(), ignore_datastore_records=True 

1769 ) 

1770 

1771 dataset_existence = self._process_mexists_records( 

1772 id_to_ref, records, True, artifact_existence=artifact_existence 

1773 ) 

1774 

1775 # Set of IDs that have been handled. 

1776 handled_ids = {ref.id for ref in dataset_existence} 

1777 

1778 missing_ids = requested_ids - handled_ids 

1779 if missing_ids: 

1780 dataset_existence.update( 

1781 self._mexists_check_expected( 

1782 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1783 ) 

1784 ) 

1785 

1786 return { 

1787 internal_ref_to_input_ref[internal_ref]: existence 

1788 for internal_ref, existence in dataset_existence.items() 

1789 } 

1790 

1791 def _mexists_check_expected( 

1792 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1793 ) -> dict[DatasetRef, bool]: 

1794 """Check existence of refs that are not known to datastore. 

1795 

1796 Parameters 

1797 ---------- 

1798 refs : iterable of `DatasetRef` 

1799 The datasets to be checked. These are assumed not to be known 

1800 to datastore. 

1801 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1802 Optional mapping of datastore artifact to existence. Updated by 

1803 this method with details of all artifacts tested. Can be `None` 

1804 if the caller is not interested. 

1805 

1806 Returns 

1807 ------- 

1808 existence : `dict` of [`DatasetRef`, `bool`] 

1809 Mapping from dataset to boolean indicating existence. 

1810 """ 

1811 dataset_existence: dict[DatasetRef, bool] = {} 

1812 if not self.trustGetRequest: 

1813 # Must assume these do not exist 

1814 for ref in refs: 

1815 dataset_existence[ref] = False 

1816 else: 

1817 log.debug( 

1818 "%d datasets were not known to datastore during initial existence check.", 

1819 len(refs), 

1820 ) 

1821 

1822 # Construct data structure identical to that returned 

1823 # by _get_stored_records_associated_with_refs() but using 

1824 # guessed names. 

1825 records = {} 

1826 id_to_ref = {} 

1827 for missing_ref in refs: 

1828 expected = self._get_expected_dataset_locations_info(missing_ref) 

1829 dataset_id = missing_ref.id 

1830 records[dataset_id] = [info for _, info in expected] 

1831 id_to_ref[dataset_id] = missing_ref 

1832 

1833 dataset_existence.update( 

1834 self._process_mexists_records( 

1835 id_to_ref, 

1836 records, 

1837 False, 

1838 artifact_existence=artifact_existence, 

1839 ) 

1840 ) 

1841 

1842 return dataset_existence 

1843 

1844 def exists(self, ref: DatasetRef) -> bool: 

1845 """Check if the dataset exists in the datastore. 

1846 

1847 Parameters 

1848 ---------- 

1849 ref : `DatasetRef` 

1850 Reference to the required dataset. 

1851 

1852 Returns 

1853 ------- 

1854 exists : `bool` 

1855 `True` if the entity exists in the `Datastore`. 

1856 

1857 Notes 

1858 ----- 

1859 The local cache is checked as a proxy for existence in the remote 

1860 object store. It is possible that another process on a different 

1861 compute node could remove the file from the object store even 

1862 though it is present in the local cache. 

1863 """ 

1864 ref = self._cast_storage_class(ref) 

1865 # We cannot trust datastore records from ref, as many unit tests delete 

1866 # datasets and check their existence. 

1867 fileLocations = self._get_dataset_locations_info(ref, ignore_datastore_records=True) 

1868 

1869 # If we are being asked to trust that the registry might not be correct, 

1870 # we ask for the expected locations and check them explicitly. 

1871 if not fileLocations: 

1872 if not self.trustGetRequest: 

1873 return False 

1874 

1875 # First check the cache. If it is not found we must check 

1876 # the datastore itself. Assume that any component in the cache 

1877 # means that the dataset does exist somewhere. 

1878 if self.cacheManager.known_to_cache(ref): 

1879 return True 

1880 

1881 # When we are guessing a dataset location we can not check 

1882 # for the existence of every component since we can not 

1883 # know if every component was written. Instead we check 

1884 # for the existence of any of the expected locations. 

1885 for location, _ in self._get_expected_dataset_locations_info(ref): 

1886 if self._artifact_exists(location): 

1887 return True 

1888 return False 

1889 

1890 # All listed artifacts must exist. 

1891 for location, storedFileInfo in fileLocations: 

1892 # Checking in cache needs the component ref. 

1893 check_ref = ref 

1894 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1895 check_ref = ref.makeComponentRef(component) 

1896 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1897 continue 

1898 

1899 if not self._artifact_exists(location): 

1900 return False 

1901 

1902 return True 

1903 

1904 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1905 """Return URIs associated with dataset. 

1906 

1907 Parameters 

1908 ---------- 

1909 ref : `DatasetRef` 

1910 Reference to the required dataset. 

1911 predict : `bool`, optional 

1912 If the datastore does not know about the dataset, should it 

1913 return a predicted URI or not? 

1914 

1915 Returns 

1916 ------- 

1917 uris : `DatasetRefURIs` 

1918 The URI to the primary artifact associated with this dataset (if 

1919 the dataset was disassembled within the datastore this may be 

1920 `None`), and the URIs to any components associated with the dataset 

1921 artifact (can be empty if there are no components). 

1922 """ 

1923 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1924 return many[ref] 

1925 

1926 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1927 """URI to the Dataset. 

1928 

1929 Parameters 

1930 ---------- 

1931 ref : `DatasetRef` 

1932 Reference to the required Dataset. 

1933 predict : `bool` 

1934 If `True`, allow URIs to be returned of datasets that have not 

1935 been written. 

1936 

1937 Returns 

1938 ------- 

1939 uri : `str` 

1940 URI pointing to the dataset within the datastore. If the 

1941 dataset does not exist in the datastore, and if ``predict`` is 

1942 `True`, the URI will be a prediction and will include a URI 

1943 fragment "#predicted". 

1944 If the datastore does not have entities that relate well 

1945 to the concept of a URI the returned URI will be 

1946 descriptive. The returned URI is not guaranteed to be obtainable. 

1947 

1948 Raises 

1949 ------ 

1950 FileNotFoundError 

1951 Raised if a URI has been requested for a dataset that does not 

1952 exist and guessing is not allowed. 

1953 RuntimeError 

1954 Raised if a request is made for a single URI but multiple URIs 

1955 are associated with this dataset. 

1956 

1957 Notes 

1958 ----- 

1959 When a predicted URI is requested an attempt will be made to form 

1960 a reasonable URI based on file templates and the expected formatter. 

1961 """ 

1962 primary, components = self.getURIs(ref, predict) 

1963 if primary is None or components: 

1964 raise RuntimeError( 

1965 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1966 ) 

1967 return primary 

1968 
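
A usage sketch, assuming a FileDatastore instance and a resolved DatasetRef: getURIs() covers both assembled and disassembled layouts, and with predict=True unwritten datasets yield URIs carrying the "#predicted" fragment.

def describe_location(datastore, ref) -> None:
    """Print where the artifact(s) for a dataset live, predicting if needed."""
    uris = datastore.getURIs(ref, predict=True)
    if uris.primaryURI is not None:
        # Assembled dataset (or a prediction of one): a single artifact.
        print(f"primary: {uris.primaryURI}")
    for component, uri in uris.componentURIs.items():
        # Disassembled composites report one URI per stored component.
        print(f"component {component}: {uri}")
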

1969 def _predict_URIs( 

1970 self, 

1971 ref: DatasetRef, 

1972 ) -> DatasetRefURIs: 

1973 """Predict the URIs of a dataset ref. 

1974 

1975 Parameters 

1976 ---------- 

1977 ref : `DatasetRef` 

1978 Reference to the required Dataset. 

1979 

1980 Returns 

1981 ------- 

1982 uris : `DatasetRefURIs` 

1983 Primary and component URIs. URIs will contain a URI fragment 

1984 "#predicted". 

1985 """ 

1986 uris = DatasetRefURIs() 

1987 

1988 if self.composites.shouldBeDisassembled(ref): 

1989 for component, _ in ref.datasetType.storageClass.components.items(): 

1990 comp_ref = ref.makeComponentRef(component) 

1991 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1992 

1993 # Add the "#predicted" URI fragment to indicate this is a 

1994 # guess 

1995 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1996 

1997 else: 

1998 location, _ = self._determine_put_formatter_location(ref) 

1999 

2000 # Add the "#predicted" URI fragment to indicate this is a guess 

2001 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

2002 

2003 return uris 

2004 

2005 def getManyURIs( 

2006 self, 

2007 refs: Iterable[DatasetRef], 

2008 predict: bool = False, 

2009 allow_missing: bool = False, 

2010 ) -> dict[DatasetRef, DatasetRefURIs]: 

2011 # Docstring inherited 

2012 

2013 uris: dict[DatasetRef, DatasetRefURIs] = {} 

2014 

2015 records = self._get_stored_records_associated_with_refs(refs) 

2016 records_keys = records.keys() 

2017 

2018 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

2019 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

2020 

2021 # Have to handle trustGetRequest mode by checking for the existence 

2022 # of the missing refs on disk. 

2023 if missing_refs: 

2024 dataset_existence = self._mexists_check_expected(missing_refs, None) 

2025 really_missing = set() 

2026 not_missing = set() 

2027 for ref, exists in dataset_existence.items(): 

2028 if exists: 

2029 not_missing.add(ref) 

2030 else: 

2031 really_missing.add(ref) 

2032 

2033 if not_missing: 

2034 # Need to recalculate the missing/existing split. 

2035 existing_refs = existing_refs + tuple(not_missing) 

2036 missing_refs = tuple(really_missing) 

2037 

2038 for ref in missing_refs: 

2039 # if this has never been written then we have to guess 

2040 if not predict: 

2041 if not allow_missing: 

2042 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

2043 else: 

2044 uris[ref] = self._predict_URIs(ref) 

2045 

2046 for ref in existing_refs: 

2047 file_infos = records[ref.id] 

2048 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

2049 uris[ref] = self._locations_to_URI(ref, file_locations) 

2050 

2051 return uris 

2052 

2053 def _locations_to_URI( 

2054 self, 

2055 ref: DatasetRef, 

2056 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

2057 ) -> DatasetRefURIs: 

2058 """Convert one or more file locations associated with a DatasetRef 

2059 to a DatasetRefURIs. 

2060 

2061 Parameters 

2062 ---------- 

2063 ref : `DatasetRef` 

2064 Reference to the dataset. 

2065 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

2066 Each item in the sequence is the location of the dataset within the 

2067 datastore and stored information about the file and its formatter. 

2068 If there is only one item in the sequence then it is treated as the 

2069 primary URI. If there is more than one item then they are treated 

2070 as component URIs. If there are no items then an error is raised 

2071 unless ``self.trustGetRequest`` is `True`. 

2072 

2073 Returns 

2074 ------- 

2075 uris : `DatasetRefURIs` 

2076 Represents the primary URI or component URIs described by the 

2077 inputs. 

2078 

2079 Raises 

2080 ------ 

2081 RuntimeError 

2082 If no file locations are passed in and ``self.trustGetRequest`` is 

2083 `False`. 

2084 FileNotFoundError 

2085 If a passed-in URI does not exist, and ``self.trustGetRequest`` 

2086 is `False`. 

2087 RuntimeError 

2088 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

2089 unexpected). 

2090 """ 

2091 guessing = False 

2092 uris = DatasetRefURIs() 

2093 

2094 if not file_locations: 

2095 if not self.trustGetRequest: 

2096 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

2097 file_locations = self._get_expected_dataset_locations_info(ref) 

2098 guessing = True 

2099 

2100 if len(file_locations) == 1: 

2101 # No disassembly so this is the primary URI 

2102 uris.primaryURI = file_locations[0][0].uri 

2103 if guessing and not uris.primaryURI.exists(): 

2104 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

2105 else: 

2106 for location, file_info in file_locations: 

2107 if file_info.component is None: 

2108 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

2109 if guessing and not location.uri.exists(): 

2110 # If we are trusting then it is entirely possible for 

2111 # some components to be missing. In that case we skip 

2112 # to the next component. 

2113 if self.trustGetRequest: 

2114 continue 

2115 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

2116 uris.componentURIs[file_info.component] = location.uri 

2117 

2118 return uris 

2119 

2120 def retrieveArtifacts( 

2121 self, 

2122 refs: Iterable[DatasetRef], 

2123 destination: ResourcePath, 

2124 transfer: str = "auto", 

2125 preserve_path: bool = True, 

2126 overwrite: bool = False, 

2127 ) -> list[ResourcePath]: 

2128 """Retrieve the file artifacts associated with the supplied refs. 

2129 

2130 Parameters 

2131 ---------- 

2132 refs : iterable of `DatasetRef` 

2133 The datasets for which file artifacts are to be retrieved. 

2134 A single ref can result in multiple files. The refs must 

2135 be resolved. 

2136 destination : `lsst.resources.ResourcePath` 

2137 Location to write the file artifacts. 

2138 transfer : `str`, optional 

2139 Method to use to transfer the artifacts. Must be one of the options 

2140 supported by `lsst.resources.ResourcePath.transfer_from()`. 

2141 "move" is not allowed. 

2142 preserve_path : `bool`, optional 

2143 If `True` the full path of the file artifact within the datastore 

2144 is preserved. If `False` the final file component of the path 

2145 is used. 

2146 overwrite : `bool`, optional 

2147 If `True` allow transfers to overwrite existing files at the 

2148 destination. 

2149 

2150 Returns 

2151 ------- 

2152 targets : `list` of `lsst.resources.ResourcePath` 

2153 URIs of file artifacts in destination location. Order is not 

2154 preserved. 

2155 """ 

2156 if not destination.isdir(): 

2157 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

2158 

2159 if transfer == "move": 

2160 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

2161 

2162 # Source -> Destination 

2163 # This also helps filter out duplicate DatasetRef in the request 

2164 # that will map to the same underlying file transfer. 

2165 to_transfer: dict[ResourcePath, ResourcePath] = {} 

2166 

2167 for ref in refs: 

2168 locations = self._get_dataset_locations_info(ref) 

2169 for location, _ in locations: 

2170 source_uri = location.uri 

2171 target_path: ResourcePathExpression 

2172 if preserve_path: 

2173 target_path = location.pathInStore 

2174 if target_path.isabs(): 

2175 # This is an absolute path to an external file. 

2176 # Use the full path. 

2177 target_path = target_path.relativeToPathRoot 

2178 else: 

2179 target_path = source_uri.basename() 

2180 target_uri = destination.join(target_path) 

2181 to_transfer[source_uri] = target_uri 

2182 

2183 # In theory can now parallelize the transfer 

2184 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2185 for source_uri, target_uri in to_transfer.items(): 

2186 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2187 

2188 return list(to_transfer.values()) 

2189 
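
A usage sketch, assuming a FileDatastore and resolved refs; the destination path and helper name are hypothetical. "move" is rejected by retrieveArtifacts(), and preserve_path keeps the datastore-relative directory structure under the destination.

from lsst.resources import ResourcePath


def export_artifacts(datastore, refs, dest_dir: str = "/tmp/exported") -> list[ResourcePath]:
    """Copy the file artifacts for the given refs into a local directory."""
    destination = ResourcePath(dest_dir, forceDirectory=True)  # Must be a directory.
    return datastore.retrieveArtifacts(
        refs,
        destination,
        transfer="copy",      # "move" out of the datastore is not allowed.
        preserve_path=True,   # Keep the in-datastore paths below dest_dir.
        overwrite=False,
    )
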

2190 def get( 

2191 self, 

2192 ref: DatasetRef, 

2193 parameters: Mapping[str, Any] | None = None, 

2194 storageClass: StorageClass | str | None = None, 

2195 ) -> Any: 

2196 """Load an InMemoryDataset from the store. 

2197 

2198 Parameters 

2199 ---------- 

2200 ref : `DatasetRef` 

2201 Reference to the required Dataset. 

2202 parameters : `dict` 

2203 `StorageClass`-specific parameters that specify, for example, 

2204 a slice of the dataset to be loaded. 

2205 storageClass : `StorageClass` or `str`, optional 

2206 The storage class to be used to override the Python type 

2207 returned by this method. By default the returned type matches 

2208 the dataset type definition for this dataset. Specifying a 

2209 read `StorageClass` can force a different type to be returned. 

2210 This type must be compatible with the original type. 

2211 

2212 Returns 

2213 ------- 

2214 inMemoryDataset : `object` 

2215 Requested dataset or slice thereof as an InMemoryDataset. 

2216 

2217 Raises 

2218 ------ 

2219 FileNotFoundError 

2220 Requested dataset can not be retrieved. 

2221 TypeError 

2222 Return value from formatter has unexpected type. 

2223 ValueError 

2224 Formatter failed to process the dataset. 

2225 """ 

2226 # Supplied storage class for the component being read is either 

2227 # from the ref itself or from an override if we want to force 

2228 # type conversion. 

2229 if storageClass is not None: 

2230 ref = ref.overrideStorageClass(storageClass) 

2231 refStorageClass = ref.datasetType.storageClass 

2232 

2233 allGetInfo = self._prepare_for_get(ref, parameters) 

2234 refComponent = ref.datasetType.component() 

2235 

2236 # Create mapping from component name to related info 

2237 allComponents = {i.component: i for i in allGetInfo} 

2238 

2239 # By definition the dataset is disassembled if we have more 

2240 # than one record for it. 

2241 isDisassembled = len(allGetInfo) > 1 

2242 

2243 # Look for the special case where we are disassembled but the 

2244 # component is a derived component that was not written during 

2245 # disassembly. For this scenario we need to check that the 

2246 # component requested is listed as a derived component for the 

2247 # composite storage class 

2248 isDisassembledReadOnlyComponent = False 

2249 if isDisassembled and refComponent: 

2250 # The composite storage class should be accessible through 

2251 # the component dataset type 

2252 compositeStorageClass = ref.datasetType.parentStorageClass 

2253 

2254 # In the unlikely scenario where the composite storage 

2255 # class is not known, we can only assume that this is a 

2256 # normal component. If that assumption is wrong then the 

2257 # branch below that reads a persisted component will fail 

2258 # so there is no need to complain here. 

2259 if compositeStorageClass is not None: 

2260 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2261 

2262 if isDisassembled and not refComponent: 

2263 # This was a disassembled dataset spread over multiple files 

2264 # and we need to put them all back together again. 

2265 # Read into memory and then assemble 

2266 

2267 # Check that the supplied parameters are suitable for the type read 

2268 refStorageClass.validateParameters(parameters) 

2269 

2270 # We want to keep track of all the parameters that were not used 

2271 # by formatters. We assume that if any of the component formatters 

2272 # use a parameter that we do not need to apply it again in the 

2273 # assembler. 

2274 usedParams = set() 

2275 

2276 components: dict[str, Any] = {} 

2277 for getInfo in allGetInfo: 

2278 # assemblerParams are parameters not understood by the 

2279 # associated formatter. 

2280 usedParams.update(set(getInfo.formatterParams)) 

2281 

2282 component = getInfo.component 

2283 

2284 if component is None: 

2285 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2286 

2287 # We do not want the formatter to think it's reading 

2288 # a component though because it is really reading a 

2289 # standalone dataset -- always tell reader it is not a 

2290 # component. 

2291 components[component] = self._read_artifact_into_memory( 

2292 getInfo, ref.makeComponentRef(component), isComponent=False 

2293 ) 

2294 

2295 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2296 

2297 # Any unused parameters will have to be passed to the assembler 

2298 if parameters: 

2299 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2300 else: 

2301 unusedParams = {} 

2302 

2303 # Process parameters 

2304 return ref.datasetType.storageClass.delegate().handleParameters( 

2305 inMemoryDataset, parameters=unusedParams 

2306 ) 

2307 

2308 elif isDisassembledReadOnlyComponent: 

2309 compositeStorageClass = ref.datasetType.parentStorageClass 

2310 if compositeStorageClass is None: 

2311 raise RuntimeError( 

2312 f"Unable to retrieve derived component '{refComponent}' since" 

2313 "no composite storage class is available." 

2314 ) 

2315 

2316 if refComponent is None: 

2317 # Mainly for mypy 

2318 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2319 

2320 # Assume that every derived component can be calculated by 

2321 # forwarding the request to a single read/write component. 

2322 # Rather than guessing which rw component is the right one by 

2323 # scanning each for a derived component of the same name, 

2324 # we ask the storage class delegate directly which one is best to 

2325 # use. 

2326 compositeDelegate = compositeStorageClass.delegate() 

2327 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2328 refComponent, set(allComponents) 

2329 ) 

2330 

2331 # Select the relevant component 

2332 rwInfo = allComponents[forwardedComponent] 

2333 

2334 # For now assume that read parameters are validated against 

2335 # the real component and not the requested component 

2336 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2337 forwardedStorageClass.validateParameters(parameters) 

2338 

2339 # The reference to use for the caching must refer to the forwarded 

2340 # component and not the derived component. 

2341 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2342 

2343 # Unfortunately the FileDescriptor inside the formatter will have 

2344 # the wrong write storage class so we need to create a new one 

2345 # given the immutability constraint. 

2346 writeStorageClass = rwInfo.info.storageClass 

2347 

2348 # We may need to put some thought into parameters for read 

2349 # components but for now forward them on as is 

2350 readFormatter = type(rwInfo.formatter)( 

2351 FileDescriptor( 

2352 rwInfo.location, 

2353 readStorageClass=refStorageClass, 

2354 storageClass=writeStorageClass, 

2355 parameters=parameters, 

2356 ), 

2357 ref.dataId, 

2358 ) 

2359 

2360 # The assembler can not receive any parameter requests for a 

2361 # derived component at this time since the assembler will 

2362 # see the storage class of the derived component and those 

2363 # parameters will have to be handled by the formatter on the 

2364 # forwarded storage class. 

2365 assemblerParams: dict[str, Any] = {} 

2366 

2367 # Need to create a new info that specifies the derived 

2368 # component and associated storage class 

2369 readInfo = DatastoreFileGetInformation( 

2370 rwInfo.location, 

2371 readFormatter, 

2372 rwInfo.info, 

2373 assemblerParams, 

2374 {}, 

2375 refComponent, 

2376 refStorageClass, 

2377 ) 

2378 

2379 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2380 

2381 else: 

2382 # Single file request or component from that composite file 

2383 for lookup in (refComponent, None): 

2384 if lookup in allComponents: 

2385 getInfo = allComponents[lookup] 

2386 break 

2387 else: 

2388 raise FileNotFoundError( 

2389 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2390 ) 

2391 

2392 # Do not need the component itself if already disassembled 

2393 if isDisassembled: 

2394 isComponent = False 

2395 else: 

2396 isComponent = getInfo.component is not None 

2397 

2398 # For a component read of a composite we want the cache to 

2399 # be looking at the composite ref itself. 

2400 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2401 

2402 # For a disassembled component we can validate parameters against 

2403 # the component storage class directly 

2404 if isDisassembled: 

2405 refStorageClass.validateParameters(parameters) 

2406 else: 

2407 # For an assembled composite this could be a derived 

2408 # component derived from a real component. The validity 

2409 # of the parameters is not clear. For now validate against 

2410 # the composite storage class 

2411 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2412 

2413 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2414 
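
A usage sketch, assuming a FileDatastore and a resolved composite DatasetRef; the component name depends on the storage class definition. The datastore decides internally whether the composite was stored assembled or disassembled.

def read_component(datastore, ref, component: str):
    """Read a single component of a composite dataset."""
    # makeComponentRef() builds a ref for the named component; get() then
    # either reads only that component's file (disassembled case) or asks
    # the formatter for the component of the single file (assembled case).
    return datastore.get(ref.makeComponentRef(component))
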

2415 @transactional 

2416 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2417 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2418 

2419 Parameters 

2420 ---------- 

2421 inMemoryDataset : `object` 

2422 The dataset to store. 

2423 ref : `DatasetRef` 

2424 Reference to the associated Dataset. 

2425 

2426 Raises 

2427 ------ 

2428 TypeError 

2429 Supplied object and storage class are inconsistent. 

2430 DatasetTypeNotSupportedError 

2431 The associated `DatasetType` is not handled by this datastore. 

2432 

2433 Notes 

2434 ----- 

2435 If the datastore is configured to reject certain dataset types it 

2436 is possible that the put will fail and raise a 

2437 `DatasetTypeNotSupportedError`. The main use case for this is to 

2438 allow `ChainedDatastore` to put to multiple datastores without 

2439 requiring that every datastore accepts the dataset. 

2440 """ 

2441 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2442 # doDisassembly = True 

2443 

2444 artifacts = [] 

2445 if doDisassembly: 

2446 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2447 if components is None: 

2448 raise RuntimeError( 

2449 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2450 f"with storage class {ref.datasetType.storageClass.name} " 

2451 "is configured to be disassembled, but cannot be." 

2452 ) 

2453 for component, componentInfo in components.items(): 

2454 # Don't recurse because we want to take advantage of 

2455 # bulk insert -- need a new DatasetRef that refers to the 

2456 # same dataset_id but has the component DatasetType 

2457 # DatasetType does not refer to the types of components, 

2458 # so we construct one ourselves. 

2459 compRef = ref.makeComponentRef(component) 

2460 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2461 artifacts.append((compRef, storedInfo)) 

2462 else: 

2463 # Write the entire thing out 

2464 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2465 artifacts.append((ref, storedInfo)) 

2466 

2467 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) 

2468 
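
A usage sketch, assuming a FileDatastore and a resolved DatasetRef whose dataset type matches the object being stored: whether the object lands as one file or as per-component files is decided by the composites configuration, not by the caller.

def store_and_verify(datastore, obj, ref) -> bool:
    """Write obj under ref and confirm that its artifact(s) were recorded."""
    datastore.put(obj, ref)
    # exists() checks every artifact recorded for the ref, so it covers both
    # the assembled and the disassembled layouts.
    return datastore.exists(ref)
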

2469 @transactional 

2470 def put_new(self, inMemoryDataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]: 

2471 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2472 # doDisassembly = True 

2473 

2474 artifacts = [] 

2475 if doDisassembly: 

2476 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2477 if components is None: 

2478 raise RuntimeError( 

2479 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2480 f"with storage class {ref.datasetType.storageClass.name} " 

2481 "is configured to be disassembled, but cannot be." 

2482 ) 

2483 for component, componentInfo in components.items(): 

2484 # Don't recurse because we want to take advantage of 

2485 # bulk insert -- need a new DatasetRef that refers to the 

2486 # same dataset_id but has the component DatasetType 

2487 # DatasetType does not refer to the types of components, 

2488 # so we construct one ourselves. 

2489 compRef = ref.makeComponentRef(component) 

2490 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2491 artifacts.append((compRef, storedInfo)) 

2492 else: 

2493 # Write the entire thing out 

2494 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2495 artifacts.append((ref, storedInfo)) 

2496 

2497 ref_records = {self._opaque_table_name: [info for _, info in artifacts]} 

2498 ref = ref.replace(datastore_records=ref_records) 

2499 return {self.name: ref} 

2500 

2501 @transactional 

2502 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2503 # At this point can safely remove these datasets from the cache 

2504 # to avoid confusion later on. If they are not trashed later 

2505 # the cache will simply be refilled. 

2506 self.cacheManager.remove_from_cache(ref) 

2507 

2508 # If we are in trust mode there will be nothing to move to 

2509 # the trash table and we will have to try to delete the file 

2510 # immediately. 

2511 if self.trustGetRequest: 

2512 # Try to keep the logic below for a single file trash. 

2513 if isinstance(ref, DatasetRef): 

2514 refs = {ref} 

2515 else: 

2516 # Will recreate ref at the end of this branch. 

2517 refs = set(ref) 

2518 

2519 # Determine which datasets are known to datastore directly. 

2520 id_to_ref = {ref.id: ref for ref in refs} 

2521 existing_ids = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2522 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2523 

2524 missing = refs - existing_refs 

2525 if missing: 

2526 # Do an explicit existence check on these refs. 

2527 # We only care about the artifacts at this point and not 

2528 # the dataset existence. 

2529 artifact_existence: dict[ResourcePath, bool] = {} 

2530 _ = self.mexists(missing, artifact_existence) 

2531 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2532 

2533 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2534 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2535 for uri in uris: 

2536 try: 

2537 uri.remove() 

2538 except Exception as e: 

2539 if ignore_errors: 

2540 log.debug("Artifact %s could not be removed: %s", uri, e) 

2541 continue 

2542 raise 

2543 

2544 # There is no point asking the code below to remove refs we 

2545 # know are missing so update it with the list of existing 

2546 # records. Try to retain one vs many logic. 

2547 if not existing_refs: 

2548 # Nothing more to do since none of the datasets were 

2549 # known to the datastore record table. 

2550 return 

2551 ref = list(existing_refs) 

2552 if len(ref) == 1: 

2553 ref = ref[0] 

2554 

2555 # Get file metadata and internal metadata 

2556 if not isinstance(ref, DatasetRef): 

2557 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2558 # Assumed to be an iterable of refs so bulk mode enabled. 

2559 try: 

2560 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2561 except Exception as e: 

2562 if ignore_errors: 

2563 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2564 else: 

2565 raise 

2566 return 

2567 

2568 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2569 

2570 fileLocations = self._get_dataset_locations_info(ref) 

2571 

2572 if not fileLocations: 

2573 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2574 if ignore_errors: 

2575 log.warning(err_msg) 

2576 return 

2577 else: 

2578 raise FileNotFoundError(err_msg) 

2579 

2580 for location, _ in fileLocations: 

2581 if not self._artifact_exists(location): 

2582 err_msg = ( 

2583 f"Dataset is known to datastore {self.name} but " 

2584 f"associated artifact ({location.uri}) is missing" 

2585 ) 

2586 if ignore_errors: 

2587 log.warning(err_msg) 

2588 return 

2589 else: 

2590 raise FileNotFoundError(err_msg) 

2591 

2592 # Mark dataset as trashed 

2593 try: 

2594 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2595 except Exception as e: 

2596 if ignore_errors: 

2597 log.warning( 

2598 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2599 "but encountered an error: %s", 

2600 ref, 

2601 self.name, 

2602 e, 

2603 ) 

2604 pass 

2605 else: 

2606 raise 

2607 

2608 @transactional 

2609 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2610 """Remove all datasets from the trash. 

2611 

2612 Parameters 

2613 ---------- 

2614 ignore_errors : `bool` 

2615 If `True` return without error even if something went wrong. 

2616 Problems could occur if another process is simultaneously trying 

2617 to delete. 

2618 """ 

2619 log.debug("Emptying trash in datastore %s", self.name) 

2620 

2621 # Context manager will empty trash iff we finish it without raising. 

2622 # It will also automatically delete the relevant rows from the 

2623 # trash table and the records table. 

2624 with self.bridge.emptyTrash( 

2625 self._table, record_class=StoredFileInfo, record_column="path" 

2626 ) as trash_data: 

2627 # Removing the artifacts themselves requires that the files are 

2628 # not also associated with refs that are not to be trashed. 

2629 # Therefore need to do a query with the file paths themselves 

2630 # and return all the refs associated with them. Can only delete 

2631 # a file if the refs to be trashed are the only refs associated 

2632 # with the file. 

2633 # This requires multiple copies of the trashed items 

2634 trashed, artifacts_to_keep = trash_data 

2635 

2636 if artifacts_to_keep is None: 

2637 # The bridge is not helping us so have to work it out 

2638 # ourselves. This is not going to be as efficient. 

2639 trashed = list(trashed) 

2640 

2641 # The instance check is for mypy since up to this point it 

2642 # does not know the type of info. 

2643 path_map = self._refs_associated_with_artifacts( 

2644 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2645 ) 

2646 

2647 for ref, info in trashed: 

2648 # Mypy needs to know this is not the base class 

2649 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2650 

2651 path_map[info.path].remove(ref.id) 

2652 if not path_map[info.path]: 

2653 del path_map[info.path] 

2654 

2655 artifacts_to_keep = set(path_map) 

2656 

2657 for ref, info in trashed: 

2658 # Should not happen for this implementation but need 

2659 # to keep mypy happy. 

2660 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2661 

2662 # Mypy needs to know this is not the base class 

2663 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2664 

2665 if info.path in artifacts_to_keep: 

2666 # This is a multi-dataset artifact and we are not 

2667 # removing all associated refs. 

2668 continue 

2669 

2670 # Only trashed refs still known to datastore will be returned. 

2671 location = info.file_location(self.locationFactory) 

2672 

2673 # Point of no return for this artifact 

2674 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2675 try: 

2676 self._delete_artifact(location) 

2677 except FileNotFoundError: 

2678 # If the file itself has been deleted there is nothing 

2679 # we can do about it. It is possible that trash has 

2680 # been run in parallel in another process or someone 

2681 # decided to delete the file. It is unlikely to come 

2682 # back and so we should still continue with the removal 

2683 # of the entry from the trash table. It is also possible 

2684 # we removed it in a previous iteration if it was 

2685 # a multi-dataset artifact. The delete artifact method 

2686 # will log a debug message in this scenario. 

2687 # Distinguishing file missing before trash started and 

2688 # file already removed previously as part of this trash 

2689 # is not worth the distinction with regards to potential 

2690 # memory cost. 

2691 pass 

2692 except Exception as e: 

2693 if ignore_errors: 

2694 # Use a debug message here even though it's not 

2695 # a good situation. In some cases this can be 

2696 # caused by a race between user A and user B 

2697 # and neither of them has permissions for the 

2698 # other's files. Butler does not know about users 

2699 # and trash has no idea what collections these 

2700 # files were in (without guessing from a path). 

2701 log.debug( 

2702 "Encountered error removing artifact %s from datastore %s: %s", 

2703 location.uri, 

2704 self.name, 

2705 e, 

2706 ) 

2707 else: 

2708 raise 

2709 
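
A usage sketch, assuming a FileDatastore and resolved refs: deletion is a two-step operation in which trash() marks the datasets (and drops them from the local cache) and emptyTrash() removes the artifacts, skipping any file still shared with an untrashed dataset.

def delete_datasets(datastore, refs) -> None:
    """Trash the given datasets and then purge their file artifacts."""
    datastore.trash(refs, ignore_errors=False)
    # Point of no return: artifacts whose every associated ref has been
    # trashed are deleted from the backing store here.
    datastore.emptyTrash(ignore_errors=False)
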

2710 @transactional 

2711 def transfer_from( 

2712 self, 

2713 source_datastore: Datastore, 

2714 refs: Iterable[DatasetRef], 

2715 transfer: str = "auto", 

2716 artifact_existence: dict[ResourcePath, bool] | None = None, 

2717 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2718 # Docstring inherited 

2719 if type(self) is not type(source_datastore): 

2720 raise TypeError( 

2721 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2722 f"source datastore ({type(source_datastore)})." 

2723 ) 

2724 

2725 # Be explicit for mypy 

2726 if not isinstance(source_datastore, FileDatastore): 

2727 raise TypeError( 

2728 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2729 f" {type(source_datastore)}" 

2730 ) 

2731 

2732 # Stop early if "direct" transfer mode is requested. That would 

2733 # require that the URI inside the source datastore should be stored 

2734 # directly in the target datastore, which seems unlikely to be useful 

2735 # since at any moment the source datastore could delete the file. 

2736 if transfer in ("direct", "split"): 

2737 raise ValueError( 

2738 f"Can not transfer from a source datastore using {transfer} mode since" 

2739 " those files are controlled by the other datastore." 

2740 ) 

2741 

2742 # Empty existence lookup if none given. 

2743 if artifact_existence is None: 

2744 artifact_existence = {} 

2745 

2746 # We will go through the list multiple times so must convert 

2747 # generators to lists. 

2748 refs = list(refs) 

2749 

2750 # In order to handle disassembled composites the code works 

2751 # at the records level since it can assume that internal APIs 

2752 # can be used. 

2753 # - If the record already exists in the destination this is assumed 

2754 # to be okay. 

2755 # - If there is no record but the source and destination URIs are 

2756 # identical no transfer is done but the record is added. 

2757 # - If the source record refers to an absolute URI currently assume 

2758 # that that URI should remain absolute and will be visible to the 

2759 # destination butler. May need to have a flag to indicate whether 

2760 # the dataset should be transferred. This will only happen if 

2761 # the detached Butler has had a local ingest. 

2762 

2763 # What we really want is all the records in the source datastore 

2764 # associated with these refs. Or derived ones if they don't exist 

2765 # in the source. 

2766 source_records = source_datastore._get_stored_records_associated_with_refs( 

2767 refs, ignore_datastore_records=True 

2768 ) 

2769 

2770 # The source dataset_ids are the keys in these records 

2771 source_ids = set(source_records) 

2772 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2773 

2774 requested_ids = {ref.id for ref in refs} 

2775 missing_ids = requested_ids - source_ids 

2776 

2777 # Missing IDs can be okay if that datastore has allowed 

2778 # gets based on file existence. Should we transfer what we can 

2779 # or complain about it and warn? 

2780 if missing_ids and not source_datastore.trustGetRequest: 

2781 raise ValueError( 

2782 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2783 ) 

2784 

2785 # Need to map these missing IDs to a DatasetRef so we can guess 

2786 # the details. 

2787 if missing_ids: 

2788 log.info( 

2789 "Number of expected datasets missing from source datastore records: %d out of %d", 

2790 len(missing_ids), 

2791 len(requested_ids), 

2792 ) 

2793 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2794 

2795 # This should be chunked in case we end up having to check 

2796 # the file store since we need some log output to show 

2797 # progress. 

2798 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2799 records = {} 

2800 for missing in missing_ids_chunk: 

2801 # Ask the source datastore where the missing artifacts 

2802 # should be. An execution butler might not know about the 

2803 # artifacts even if they are there. 

2804 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2805 records[missing] = [info for _, info in expected] 

2806 

2807 # Call the mexist helper method in case we have not already 

2808 # checked these artifacts such that artifact_existence is 

2809 # empty. This allows us to benefit from parallelism. 

2810 # datastore.mexists() itself does not give us access to the 

2811 # derived datastore record. 

2812 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2813 ref_exists = source_datastore._process_mexists_records( 

2814 id_to_ref, records, False, artifact_existence=artifact_existence 

2815 ) 

2816 

2817 # Now go through the records and propagate the ones that exist. 

2818 location_factory = source_datastore.locationFactory 

2819 for missing, record_list in records.items(): 

2820 # Skip completely if the ref does not exist. 

2821 ref = id_to_ref[missing] 

2822 if not ref_exists[ref]: 

2823 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2824 continue 

2825 # Check for file artifact to decide which parts of a 

2826 # disassembled composite do exist. If there is only a 

2827 # single record we don't even need to look because it can't 

2828 # be a composite and must exist. 

2829 if len(record_list) == 1: 

2830 dataset_records = record_list 

2831 else: 

2832 dataset_records = [ 

2833 record 

2834 for record in record_list 

2835 if artifact_existence[record.file_location(location_factory).uri] 

2836 ] 

2837 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2838 

2839 # Rely on source_records being a defaultdict. 

2840 source_records[missing].extend(dataset_records) 

2841 

2842 # See if we already have these records 

2843 target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) 

2844 

2845 # The artifacts to register 

2846 artifacts = [] 

2847 

2848 # Refs that already exist 

2849 already_present = [] 

2850 

2851 # Refs that were rejected by this datastore. 

2852 rejected = set() 

2853 

2854 # Refs that were transferred successfully. 

2855 accepted = set() 

2856 

2857 # Record each time we have done a "direct" transfer. 

2858 direct_transfers = [] 

2859 

2860 # Now can transfer the artifacts 

2861 for ref in refs: 

2862 if not self.constraints.isAcceptable(ref): 

2863 # This datastore should not be accepting this dataset. 

2864 rejected.add(ref) 

2865 continue 

2866 

2867 accepted.add(ref) 

2868 

2869 if ref.id in target_records: 

2870 # Already have an artifact for this. 

2871 already_present.append(ref) 

2872 continue 

2873 

2874 # mypy needs to know these are always resolved refs 

2875 for info in source_records[ref.id]: 

2876 source_location = info.file_location(source_datastore.locationFactory) 

2877 target_location = info.file_location(self.locationFactory) 

2878 if source_location == target_location and not source_location.pathInStore.isabs(): 

2879 # Artifact is already in the target location. 

2880 # (which is how execution butler currently runs) 

2881 pass 

2882 else: 

2883 if target_location.pathInStore.isabs(): 

2884 # Just because we can see the artifact when running 

2885 # the transfer doesn't mean it will be generally 

2886 # accessible to a user of this butler. Need to decide 

2887 # what to do about an absolute path. 

2888 if transfer == "auto": 

2889 # For "auto" transfers we allow the absolute URI 

2890 # to be recorded in the target datastore. 

2891 direct_transfers.append(source_location) 

2892 else: 

2893 # The user is explicitly requesting a transfer 

2894 # even for an absolute URI. This requires us to 

2895 # calculate the target path. 

2896 template_ref = ref 

2897 if info.component: 

2898 template_ref = ref.makeComponentRef(info.component) 

2899 target_location = self._calculate_ingested_datastore_name( 

2900 source_location.uri, 

2901 template_ref, 

2902 ) 

2903 

2904 info = info.update(path=target_location.pathInStore.path) 

2905 

2906 # Need to transfer it to the new location. 

2907 # Assume we should always overwrite. If the artifact

2908 # is already there, this might indicate that a previous

2909 # transfer was interrupted and could not be rolled back

2910 # completely (e.g. pre-emption), so follow the Datastore

2911 # default and overwrite.

2912 target_location.uri.transfer_from( 

2913 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2914 ) 

2915 

2916 artifacts.append((ref, info)) 

2917 

2918 if direct_transfers: 

2919 log.info( 

2920 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2921 len(direct_transfers), 

2922 "" if len(direct_transfers) == 1 else "s", 

2923 ) 

2924 

2925 # We may be overwriting datasets that already existed, so

2926 # ensure the datastore records are forced to agree. Note that

2927 # this can potentially lead to difficulties if the dataset was

2928 # previously ingested disassembled and is somehow now

2929 # assembled, or vice versa.

2930 self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.REPLACE) 

2931 

2932 if already_present: 

2933 n_skipped = len(already_present) 

2934 log.info( 

2935 "Skipped transfer of %d dataset%s already present in datastore", 

2936 n_skipped, 

2937 "" if n_skipped == 1 else "s", 

2938 ) 

2939 

2940 return accepted, rejected 

2941 
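# --- Illustrative sketch (not part of the fileDatastore.py source) ---------
# The method above returns two disjoint sets: refs the datastore constraints
# accepted (and attempted to transfer) and refs that were rejected outright.
# A standalone sketch of that partitioning pattern; ``is_acceptable`` is a
# hypothetical predicate standing in for ``self.constraints.isAcceptable``:
from collections.abc import Callable, Iterable


def _sketch_partition(
    refs: Iterable[str], is_acceptable: Callable[[str], bool]
) -> tuple[set[str], set[str]]:
    accepted: set[str] = set()
    rejected: set[str] = set()
    for ref in refs:
        (accepted if is_acceptable(ref) else rejected).add(ref)
    return accepted, rejected


_ok, _bad = _sketch_partition(["calexp", "raw"], lambda name: name != "raw")
assert _ok == {"calexp"} and _bad == {"raw"}
# ---------------------------------------------------------------------------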

2942 @transactional 

2943 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2944 # Docstring inherited. 

2945 refs = list(refs) 

2946 self.bridge.forget(refs) 

2947 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2948 

2949 def validateConfiguration( 

2950 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2951 ) -> None: 

2952 """Validate some of the configuration for this datastore. 

2953 

2954 Parameters 

2955 ---------- 

2956 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2957 Entities to test against this configuration. Can be of

2958 differing types.

2959 logFailures : `bool`, optional 

2960 If `True`, output a log message for every validation error 

2961 detected. 

2962 

2963 Raises 

2964 ------ 

2965 DatastoreValidationError 

2966 Raised if there is a validation problem with a configuration. 

2967 All the problems are reported in a single exception. 

2968 

2969 Notes 

2970 ----- 

2971 This method checks that all the supplied entities have valid file 

2972 templates and also have formatters defined. 

2973 """ 

2974 templateFailed = None 

2975 try: 

2976 self.templates.validateTemplates(entities, logFailures=logFailures) 

2977 except FileTemplateValidationError as e: 

2978 templateFailed = str(e) 

2979 

2980 formatterFailed = [] 

2981 for entity in entities: 

2982 try: 

2983 self.formatterFactory.getFormatterClass(entity) 

2984 except KeyError as e: 

2985 formatterFailed.append(str(e)) 

2986 if logFailures: 

2987 log.critical("Formatter failure: %s", e) 

2988 

2989 if templateFailed or formatterFailed: 

2990 messages = [] 

2991 if templateFailed: 

2992 messages.append(templateFailed) 

2993 if formatterFailed: 

2994 messages.append(",".join(formatterFailed)) 

2995 msg = ";\n".join(messages) 

2996 raise DatastoreValidationError(msg) 

2997 
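# --- Illustrative sketch (not part of the fileDatastore.py source) ---------
# validateConfiguration() collects every template and formatter failure and
# raises a single exception describing all of them, joined with ";\n". A
# standalone sketch of that aggregation pattern; ``ValueError`` and the
# ``_sketch_validate`` helper stand in for ``DatastoreValidationError`` and
# the real checks:
def _sketch_validate(entities: list[str]) -> None:
    messages: list[str] = []
    for entity in entities:
        if entity.startswith("bad"):  # pretend this entity fails validation
            messages.append(f"no formatter defined for {entity}")
    if messages:
        raise ValueError(";\n".join(messages))


try:
    _sketch_validate(["good_type", "bad_type_1", "bad_type_2"])
except ValueError as exc:
    # Both failures are reported in the single exception message.
    assert "bad_type_1" in str(exc) and "bad_type_2" in str(exc)
# ---------------------------------------------------------------------------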

2998 def getLookupKeys(self) -> set[LookupKey]: 

2999 # Docstring is inherited from base class 

3000 return ( 

3001 self.templates.getLookupKeys() 

3002 | self.formatterFactory.getLookupKeys() 

3003 | self.constraints.getLookupKeys() 

3004 ) 

3005 

3006 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

3007 # Docstring is inherited from base class 

3008 # The key can be valid in either formatters or templates so we can 

3009 # only check the template if it exists 

3010 if lookupKey in self.templates: 

3011 try: 

3012 self.templates[lookupKey].validateTemplate(entity) 

3013 except FileTemplateValidationError as e: 

3014 raise DatastoreValidationError(e) from e 

3015 

3016 def export( 

3017 self, 

3018 refs: Iterable[DatasetRef], 

3019 *, 

3020 directory: ResourcePathExpression | None = None, 

3021 transfer: str | None = "auto", 

3022 ) -> Iterable[FileDataset]: 

3023 # Docstring inherited from Datastore.export. 

3024 if transfer == "auto" and directory is None: 

3025 transfer = None 

3026 

3027 if transfer is not None and directory is None: 

3028 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

3029 

3030 if transfer == "move": 

3031 raise TypeError("Can not export by moving files out of datastore.") 

3032 elif transfer == "direct": 

3033 # For an export, treat this as equivalent to None. We do not 

3034 # want an import to risk using absolute URIs to datasets owned 

3035 # by another datastore. 

3036 log.info("Treating 'direct' transfer mode as in-place export.") 

3037 transfer = None 

3038 

3039 # Force the directory to be a URI object 

3040 directoryUri: ResourcePath | None = None 

3041 if directory is not None: 

3042 directoryUri = ResourcePath(directory, forceDirectory=True) 

3043 

3044 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

3045 # mypy needs the second test 

3046 raise FileNotFoundError(f"Export location {directory} does not exist") 

3047 

3048 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

3049 for ref in progress.wrap(refs, "Exporting dataset files"): 

3050 fileLocations = self._get_dataset_locations_info(ref) 

3051 if not fileLocations: 

3052 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

3053 # For now we cannot export disassembled datasets

3054 if len(fileLocations) > 1: 

3055 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

3056 location, storedFileInfo = fileLocations[0] 

3057 

3058 pathInStore = location.pathInStore.path 

3059 if transfer is None: 

3060 # TODO: do we also need to return the readStorageClass somehow? 

3061 # We will use the path in store directly. If this is an 

3062 # absolute URI, preserve it. 

3063 if location.pathInStore.isabs(): 

3064 pathInStore = str(location.uri) 

3065 elif transfer == "direct": 

3066 # Use full URIs to the remote store in the export 

3067 pathInStore = str(location.uri) 

3068 else: 

3069 # mypy needs help 

3070 assert directoryUri is not None, "directoryUri must be defined to get here" 

3071 storeUri = ResourcePath(location.uri) 

3072 

3073 # If the datastore has an absolute URI to a resource, we

3074 # have two options: 

3075 # 1. Keep the absolute URI in the exported YAML 

3076 # 2. Allocate a new name in the local datastore and transfer 

3077 # it. 

3078 # For now go with option 2 

3079 if location.pathInStore.isabs(): 

3080 template = self.templates.getTemplate(ref) 

3081 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

3082 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

3083 

3084 exportUri = directoryUri.join(pathInStore) 

3085 exportUri.transfer_from(storeUri, transfer=transfer) 

3086 

3087 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

3088 
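# --- Illustrative sketch (not part of the fileDatastore.py source) ---------
# The argument handling at the top of export() can be summarised as: "auto"
# degrades to an in-place export when no directory is given, any other
# transfer mode requires a directory, "move" is forbidden, and "direct" is
# treated as in-place. A simplified standalone restatement of that decision
# table (the helper name is made up for illustration):
def _sketch_resolve_export_transfer(transfer: str | None, directory: str | None) -> str | None:
    if transfer == "auto" and directory is None:
        return None
    if transfer is not None and directory is None:
        raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given")
    if transfer == "move":
        raise TypeError("Cannot export by moving files out of datastore.")
    if transfer == "direct":
        return None  # treated as an in-place export
    return transfer


assert _sketch_resolve_export_transfer("auto", None) is None
assert _sketch_resolve_export_transfer("copy", "/tmp/export") == "copy"
# ---------------------------------------------------------------------------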

3089 @staticmethod 

3090 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

3091 """Compute the checksum of the supplied file. 

3092 

3093 Parameters 

3094 ---------- 

3095 uri : `lsst.resources.ResourcePath` 

3096 Name of resource to calculate checksum from. 

3097 algorithm : `str`, optional 

3098 Name of algorithm to use. Must be one of the algorithms supported

3099 by the :py:mod:`hashlib` module.

3100 block_size : `int`, optional

3101 Number of bytes to read from the file at one time.

3102 

3103 Returns 

3104 ------- 

3105 hexdigest : `str` or `None`

3106 Hex digest of the file, or `None` if the resource is not local.

3107 

3108 Notes 

3109 ----- 

3110 Currently returns None if the URI is for a remote resource. 

3111 """ 

3112 if algorithm not in hashlib.algorithms_guaranteed: 

3113 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

3114 

3115 if not uri.isLocal: 

3116 return None 

3117 

3118 hasher = hashlib.new(algorithm) 

3119 

3120 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

3121 for chunk in iter(lambda: f.read(block_size), b""): 

3122 hasher.update(chunk) 

3123 

3124 return hasher.hexdigest() 

3125 
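# --- Illustrative sketch (not part of the fileDatastore.py source) ---------
# computeChecksum() reads the local file in fixed-size blocks and feeds each
# block to a hashlib hasher, so large files never need to be held in memory.
# The same pattern applied to a temporary file; the temporary file is only
# for illustration, while the algorithm and block size match the defaults
# above:
import hashlib
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as _tmpdir:
    _path = Path(_tmpdir, "example.bin")
    _path.write_bytes(b"example payload")
    _hasher = hashlib.new("blake2b")
    with open(_path, "rb") as _f:
        for _chunk in iter(lambda: _f.read(8192), b""):
            _hasher.update(_chunk)
    assert len(_hasher.hexdigest()) == 128  # blake2b gives a 512-bit digest
# ---------------------------------------------------------------------------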

3126 def needs_expanded_data_ids( 

3127 self, 

3128 transfer: str | None, 

3129 entity: DatasetRef | DatasetType | StorageClass | None = None, 

3130 ) -> bool: 

3131 # Docstring inherited. 

3132 # This _could_ also use entity to inspect whether the filename template 

3133 # involves placeholders other than the required dimensions for its 

3134 # dataset type, but that's not necessary for correctness; it just 

3135 # enables more optimizations (perhaps only in theory). 

3136 return transfer not in ("direct", None) 

3137 
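# --- Illustrative sketch (not part of the fileDatastore.py source) ---------
# The rule above says expanded data IDs are needed exactly when the file will
# be renamed into the datastore via a file template, i.e. for every transfer
# mode except "direct" (absolute URI recorded as-is) and None (file left in
# place). A standalone restatement of that predicate:
def _sketch_needs_expanded(transfer: str | None) -> bool:
    return transfer not in ("direct", None)


assert _sketch_needs_expanded("copy") and _sketch_needs_expanded("move")
assert not _sketch_needs_expanded("direct") and not _sketch_needs_expanded(None)
# ---------------------------------------------------------------------------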

3138 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

3139 # Docstring inherited from the base class. 

3140 record_data = data.get(self.name) 

3141 if not record_data: 

3142 return 

3143 

3144 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

3145 

3146 # TODO: Verify that there are no unexpected table names in the dict? 

3147 unpacked_records = [] 

3148 for dataset_id, dataset_data in record_data.records.items(): 

3149 records = dataset_data.get(self._table.name) 

3150 if records: 

3151 for info in records: 

3152 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

3153 unpacked_records.append(info.to_record(dataset_id=dataset_id)) 

3154 if unpacked_records: 

3155 self._table.insert(*unpacked_records, transaction=self._transaction) 

3156 

3157 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

3158 # Docstring inherited from the base class. 

3159 exported_refs = list(self._bridge.check(refs)) 

3160 ids = {ref.id for ref in exported_refs} 

3161 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

3162 for row in self._table.fetch(dataset_id=ids): 

3163 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

3164 dataset_records = records.setdefault(row["dataset_id"], {}) 

3165 dataset_records.setdefault(self._table.name, []).append(info) 

3166 

3167 record_data = DatastoreRecordData(records=records) 

3168 return {self.name: record_data} 

3169 
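# --- Illustrative sketch (not part of the fileDatastore.py source) ---------
# export_records() groups opaque-table rows first by dataset ID and then by
# table name using nested ``setdefault`` calls. The same pattern on made-up
# rows; the table name "file_datastore_records" is hypothetical:
_rows = [
    {"dataset_id": "d1", "path": "a.fits"},
    {"dataset_id": "d1", "path": "b.fits"},
    {"dataset_id": "d2", "path": "c.fits"},
]
_grouped: dict[str, dict[str, list[str]]] = {}
for _row in _rows:
    _per_dataset = _grouped.setdefault(_row["dataset_id"], {})
    _per_dataset.setdefault("file_datastore_records", []).append(_row["path"])
assert _grouped["d1"]["file_datastore_records"] == ["a.fits", "b.fits"]
assert list(_grouped) == ["d1", "d2"]
# ---------------------------------------------------------------------------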

3170 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

3171 # Docstring inherited from the base class. 

3172 self._retrieve_dataset_method = method 

3173 

3174 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

3175 """Update dataset reference to use the storage class from registry.""" 

3176 if self._retrieve_dataset_method is None: 

3177 # We could raise an exception here but unit tests do not define 

3178 # this method. 

3179 return ref 

3180 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

3181 if dataset_type is not None: 

3182 ref = ref.overrideStorageClass(dataset_type.storageClass) 

3183 return ref 

3184 

3185 def get_opaque_table_definitions(self) -> Mapping[str, DatastoreOpaqueTable]: 

3186 # Docstring inherited from the base class. 

3187 return {self._opaque_table_name: DatastoreOpaqueTable(self.makeTableSpec(ddl.GUID), StoredFileInfo)}