Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 8%

974 statements  

coverage.py v7.2.7, created at 2023-06-14 09:11 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from collections.abc import Callable, Iterable, Mapping, Sequence 

31from dataclasses import dataclass 

32from typing import TYPE_CHECKING, Any, ClassVar 

33 

34from lsst.daf.butler import ( 

35 CompositesMap, 

36 Config, 

37 DatasetId, 

38 DatasetRef, 

39 DatasetRefURIs, 

40 DatasetType, 

41 DatasetTypeNotSupportedError, 

42 Datastore, 

43 DatastoreCacheManager, 

44 DatastoreConfig, 

45 DatastoreDisabledCacheManager, 

46 DatastoreRecordData, 

47 DatastoreValidationError, 

48 FileDataset, 

49 FileDescriptor, 

50 FileTemplates, 

51 FileTemplateValidationError, 

52 Formatter, 

53 FormatterFactory, 

54 Location, 

55 LocationFactory, 

56 Progress, 

57 StorageClass, 

58 StoredDatastoreItemInfo, 

59 StoredFileInfo, 

60 ddl, 

61) 

62from lsst.daf.butler.core.repoRelocation import replaceRoot 

63from lsst.daf.butler.core.utils import transactional 

64from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

65from lsst.resources import ResourcePath, ResourcePathExpression 

66from lsst.utils.introspection import get_class_of, get_instance_of 

67from lsst.utils.iteration import chunk_iterable 

68 

69# For VERBOSE logging usage. 

70from lsst.utils.logging import VERBOSE, getLogger 

71from lsst.utils.timer import time_this 

72from sqlalchemy import BigInteger, String 

73 

74from ..registry.interfaces import FakeDatasetRef 

75from .genericDatastore import GenericBaseDatastore 

76 

77if TYPE_CHECKING: 

78 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

79 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

80 

81log = getLogger(__name__) 

82 

83 

84class _IngestPrepData(Datastore.IngestPrepData): 

85 """Helper class for FileDatastore ingest implementation. 

86 

87 Parameters 

88 ---------- 

89 datasets : `~collections.abc.Iterable` of `FileDataset` 

90 Files to be ingested by this datastore. 

91 """ 

92 

93 def __init__(self, datasets: Iterable[FileDataset]): 

94 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

95 self.datasets = datasets 

96 

97 

98@dataclass(frozen=True) 

99class DatastoreFileGetInformation: 

100 """Collection of useful parameters needed to retrieve a file from 

101 a Datastore. 

102 """ 

103 

104 location: Location 

105 """The location from which to read the dataset.""" 

106 

107 formatter: Formatter 

108 """The `Formatter` to use to deserialize the dataset.""" 

109 

110 info: StoredFileInfo 

111 """Stored information about this file and its formatter.""" 

112 

113 assemblerParams: Mapping[str, Any] 

114 """Parameters to use for post-processing the retrieved dataset.""" 

115 

116 formatterParams: Mapping[str, Any] 

117 """Parameters that were understood by the associated formatter.""" 

118 

119 component: str | None 

120 """The component to be retrieved (can be `None`).""" 

121 

122 readStorageClass: StorageClass 

123 """The `StorageClass` of the dataset being read.""" 

124 

125 

126class FileDatastore(GenericBaseDatastore): 

127 """Generic Datastore for file-based implementations. 

128 

129 Should always be sub-classed since key abstract methods are missing. 

130 

131 Parameters 

132 ---------- 

133 config : `DatastoreConfig` or `str` 

134 Configuration as either a `Config` object or URI to file. 

135 bridgeManager : `DatastoreRegistryBridgeManager` 

136 Object that manages the interface between `Registry` and datastores. 

137 butlerRoot : `str`, optional 

138 New datastore root to use to override the configuration value. 

139 

140 Raises 

141 ------ 

142 ValueError 

143 If root location does not exist and ``create`` is `False` in the 

144 configuration. 

145 """ 

146 

147 defaultConfigFile: ClassVar[str | None] = None 

148 """Path to configuration defaults. Accessed within the ``config`` resource 

149 or relative to a search path. Can be None if no defaults specified. 

150 """ 

151 

152 root: ResourcePath 

153 """Root directory URI of this `Datastore`.""" 

154 

155 locationFactory: LocationFactory 

156 """Factory for creating locations relative to the datastore root.""" 

157 

158 formatterFactory: FormatterFactory 

159 """Factory for creating instances of formatters.""" 

160 

161 templates: FileTemplates 

162 """File templates that can be used by this `Datastore`.""" 

163 

164 composites: CompositesMap 

165 """Determines whether a dataset should be disassembled on put.""" 

166 

167 defaultConfigFile = "datastores/fileDatastore.yaml" 

168 """Path to configuration defaults. Accessed within the ``config`` resource 

169 or relative to a search path. Can be None if no defaults specified. 

170 """ 

171 

172 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

173 """Callable that is used in trusted mode to retrieve registry definition 

174 of a named dataset type. 

175 """ 

176 

177 @classmethod 

178 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

179 """Set any filesystem-dependent config options for this Datastore to 

180 be appropriate for a new empty repository with the given root. 

181 

182 Parameters 

183 ---------- 

184 root : `str` 

185 URI to the root of the data repository. 

186 config : `Config` 

187 A `Config` to update. Only the subset understood by 

188 this component will be updated. Will not expand 

189 defaults. 

190 full : `Config` 

191 A complete config with all defaults expanded that can be 

192 converted to a `DatastoreConfig`. Read-only and will not be 

193 modified by this method. 

194 Repository-specific options that should not be obtained 

195 from defaults when Butler instances are constructed 

196 should be copied from ``full`` to ``config``. 

197 overwrite : `bool`, optional 

198 If `False`, do not modify a value in ``config`` if the value 

199 already exists. Default is always to overwrite with the provided 

200 ``root``. 

201 

202 Notes 

203 ----- 

204 If a keyword is explicitly defined in the supplied ``config`` it 

205 will not be overridden by this method if ``overwrite`` is `False`. 

206 This allows explicit values set in external configs to be retained. 

207 """ 

208 Config.updateParameters( 

209 DatastoreConfig, 

210 config, 

211 full, 

212 toUpdate={"root": root}, 

213 toCopy=("cls", ("records", "table")), 

214 overwrite=overwrite, 

215 ) 

216 

217 @classmethod 

218 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

219 return ddl.TableSpec( 

220 fields=[ 

221 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

222 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

223 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

224 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

225 # Use empty string to indicate no component 

226 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

227 # TODO: should checksum be Base64Bytes instead? 

228 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

229 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

230 ], 

231 unique=frozenset(), 

232 indexes=[ddl.IndexSpec("path")], 

233 ) 

234 

235 def __init__( 

236 self, 

237 config: DatastoreConfig | ResourcePathExpression, 

238 bridgeManager: DatastoreRegistryBridgeManager, 

239 butlerRoot: str | None = None, 

240 ): 

241 super().__init__(config, bridgeManager) 

242 if "root" not in self.config: 

243 raise ValueError("No root directory specified in configuration") 

244 

245 self._bridgeManager = bridgeManager 

246 

247 # Name ourselves either using an explicit name or a name 

248 # derived from the (unexpanded) root 

249 if "name" in self.config: 

250 self.name = self.config["name"] 

251 else: 

252 # We use the unexpanded root in the name to indicate that this 

253 # datastore can be moved without having to update registry. 

254 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

255 

256 # Support repository relocation in config 

257 # Existence of self.root is checked in subclass 

258 self.root = ResourcePath( 

259 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

260 ) 

261 

262 self.locationFactory = LocationFactory(self.root) 

263 self.formatterFactory = FormatterFactory() 

264 

265 # Now associate formatters with storage classes 

266 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

267 

268 # Read the file naming templates 

269 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

270 

271 # See if composites should be disassembled 

272 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

273 

274 tableName = self.config["records", "table"] 

275 try: 

276 # Storage of paths and formatters, keyed by dataset_id 

277 self._table = bridgeManager.opaque.register( 

278 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

279 ) 

280 # Interface to Registry. 

281 self._bridge = bridgeManager.register(self.name) 

282 except ReadOnlyDatabaseError: 

283 # If the database is read only and we just tried and failed to 

284 # create a table, it means someone is trying to create a read-only 

285 # butler client for an empty repo. That should be okay, as long 

286 # as they then try to get any datasets before some other client 

287 # creates the table. Chances are they're just validating 

288 # configuration. 

289 pass 

290 

291 # Determine whether checksums should be used - default to False 

292 self.useChecksum = self.config.get("checksum", False) 

293 

294 # Determine whether we can fall back to configuration if a 

295 # requested dataset is not known to registry 

296 self.trustGetRequest = self.config.get("trust_get_request", False) 

297 

298 # Create a cache manager 

299 self.cacheManager: AbstractDatastoreCacheManager 

300 if "cached" in self.config: 

301 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

302 else: 

303 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

304 

305 # Check existence and create directory structure if necessary 

306 if not self.root.exists(): 

307 if "create" not in self.config or not self.config["create"]: 

308 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

309 try: 

310 self.root.mkdir() 

311 except Exception as e: 

312 raise ValueError( 

313 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

314 ) from e 

315 

316 def __str__(self) -> str: 

317 return str(self.root) 

318 

319 @property 

320 def bridge(self) -> DatastoreRegistryBridge: 

321 return self._bridge 

322 

323 def _artifact_exists(self, location: Location) -> bool: 

324 """Check that an artifact exists in this datastore at the specified 

325 location. 

326 

327 Parameters 

328 ---------- 

329 location : `Location` 

330 Expected location of the artifact associated with this datastore. 

331 

332 Returns 

333 ------- 

334 exists : `bool` 

335 `True` if the location can be found, `False` otherwise. 

336 """ 

337 log.debug("Checking if resource exists: %s", location.uri) 

338 return location.uri.exists() 

339 

340 def _delete_artifact(self, location: Location) -> None: 

341 """Delete the artifact from the datastore. 

342 

343 Parameters 

344 ---------- 

345 location : `Location` 

346 Location of the artifact associated with this datastore. 

347 """ 

348 if location.pathInStore.isabs(): 

349 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

350 

351 try: 

352 location.uri.remove() 

353 except FileNotFoundError: 

354 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

355 raise 

356 except Exception as e: 

357 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

358 raise 

359 log.debug("Successfully deleted file: %s", location.uri) 

360 

361 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

362 # Docstring inherited from GenericBaseDatastore 

363 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

364 self._table.insert(*records, transaction=self._transaction) 

365 

366 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

367 # Docstring inherited from GenericBaseDatastore 

368 

369 # Look for the dataset_id -- there might be multiple matches 

370 # if we have disassembled the dataset. 

371 records = self._table.fetch(dataset_id=ref.id) 

372 return [StoredFileInfo.from_record(record) for record in records] 

373 

374 def _get_stored_records_associated_with_refs( 

375 self, refs: Iterable[DatasetIdRef] 

376 ) -> dict[DatasetId, list[StoredFileInfo]]: 

377 """Retrieve all records associated with the provided refs. 

378 

379 Parameters 

380 ---------- 

381 refs : iterable of `DatasetIdRef` 

382 The refs for which records are to be retrieved. 

383 

384 Returns 

385 ------- 

386 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

387 The matching records indexed by the ref ID. The number of entries 

388 in the dict can be smaller than the number of requested refs. 

389 """ 

390 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

391 

392 # Uniqueness is dataset_id + component so can have multiple records 

393 # per ref. 

394 records_by_ref = defaultdict(list) 

395 for record in records: 

396 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

397 return records_by_ref 

398 
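The grouping above is a plain ``defaultdict`` pattern: because the records table is unique on (dataset_id, component), a disassembled composite contributes several rows for one ref. A minimal standalone sketch, with literal dicts standing in for opaque-table rows (illustrative only, not part of fileDatastore.py):

from collections import defaultdict

# Hypothetical rows as an opaque-table fetch might return them: one row per
# (dataset_id, component) pair, so one dataset can contribute several rows.
rows = [
    {"dataset_id": "uuid-1", "component": "image"},
    {"dataset_id": "uuid-1", "component": "mask"},
    {"dataset_id": "uuid-2", "component": ""},
]

records_by_ref = defaultdict(list)
for row in rows:
    records_by_ref[row["dataset_id"]].append(row)

assert len(records_by_ref["uuid-1"]) == 2  # disassembled composite: two records
assert len(records_by_ref["uuid-2"]) == 1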

399 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

400 """Return paths and associated dataset refs. 

401 

402 Parameters 

403 ---------- 

404 paths : `list` of `str` or `lsst.resources.ResourcePath` 

405 All the paths to include in search. 

406 

407 Returns 

408 ------- 

409 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

410 Mapping of each path to a set of associated database IDs. 

411 """ 

412 records = self._table.fetch(path=[str(path) for path in paths]) 

413 result = defaultdict(set) 

414 for row in records: 

415 result[row["path"]].add(row["dataset_id"]) 

416 return result 

417 

418 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

419 """Return all dataset refs associated with the supplied path. 

420 

421 Parameters 

422 ---------- 

423 pathInStore : `lsst.resources.ResourcePath` 

424 Path of interest in the data store. 

425 

426 Returns 

427 ------- 

428 ids : `set` of `DatasetId` 

429 All `DatasetRef` IDs associated with this path. 

430 """ 

431 records = list(self._table.fetch(path=str(pathInStore))) 

432 ids = {r["dataset_id"] for r in records} 

433 return ids 

434 

435 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

436 # Docstring inherited from GenericBaseDatastore 

437 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

438 

439 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

440 r"""Find all the `Location`\ s of the requested dataset in the 

441 `Datastore` and the associated stored file information. 

442 

443 Parameters 

444 ---------- 

445 ref : `DatasetRef` 

446 Reference to the required `Dataset`. 

447 

448 Returns 

449 ------- 

450 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

451 Location of the dataset within the datastore and 

452 stored information about each file and its formatter. 

453 """ 

454 # Get the file information (this will fail if no file) 

455 records = self.getStoredItemsInfo(ref) 

456 

457 # Use the path to determine the location -- we need to take 

458 # into account absolute URIs in the datastore record 

459 return [(r.file_location(self.locationFactory), r) for r in records] 

460 

461 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

462 """Check that there is only one dataset associated with the 

463 specified artifact. 

464 

465 Parameters 

466 ---------- 

467 ref : `DatasetRef` or `FakeDatasetRef` 

468 Dataset to be removed. 

469 location : `Location` 

470 The location of the artifact to be removed. 

471 

472 Returns 

473 ------- 

474 can_remove : `bool` 

475 True if the artifact can be safely removed. 

476 """ 

477 # Can't ever delete absolute URIs. 

478 if location.pathInStore.isabs(): 

479 return False 

480 

481 # Get all entries associated with this path 

482 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

483 if not allRefs: 

484 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

485 

486 # Remove these refs from all the refs and if there is nothing left 

487 # then we can delete 

488 remainingRefs = allRefs - {ref.id} 

489 

490 if remainingRefs: 

491 return False 

492 return True 

493 
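The check above reduces to a set difference: an artifact may only be removed when no other registered dataset still refers to it. A standalone sketch of that rule, with integers standing in for dataset IDs (illustrative only):

# Every dataset ID registered against this artifact, per the records table.
all_refs = {101, 102}
ref_to_remove = 101

remaining_refs = all_refs - {ref_to_remove}
can_remove = not remaining_refs

assert can_remove is False  # dataset 102 still needs the file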

494 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

495 """Predict the location and related file information of the requested 

496 dataset in this datastore. 

497 

498 Parameters 

499 ---------- 

500 ref : `DatasetRef` 

501 Reference to the required `Dataset`. 

502 

503 Returns 

504 ------- 

505 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

506 Expected Location of the dataset within the datastore and 

507 placeholder information about each file and its formatter. 

508 

509 Notes 

510 ----- 

511 Uses the current configuration to determine how we would expect the 

512 datastore files to have been written if we couldn't ask registry. 

513 This is safe so long as there has been no change to datastore 

514 configuration between writing the dataset and wanting to read it. 

515 Will not work for files that have been ingested without using the 

516 standard file template or default formatter. 

517 """ 

518 

519 # If we have a component ref we always need to ask the questions 

520 # of the composite. If the composite is disassembled this routine 

521 # should return all components. If the composite was not 

522 # disassembled the composite is what is stored regardless of 

523 # component request. Note that if the caller has disassembled 

524 # a composite there is no way for this guess to know that 

525 # without trying both the composite and component ref and seeing 

526 # if there is something at the component Location even without 

527 # disassembly being enabled. 

528 if ref.datasetType.isComponent(): 

529 ref = ref.makeCompositeRef() 

530 

531 # See if the ref is a composite that should be disassembled 

532 doDisassembly = self.composites.shouldBeDisassembled(ref) 

533 

534 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

535 

536 if doDisassembly: 

537 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

538 compRef = ref.makeComponentRef(component) 

539 location, formatter = self._determine_put_formatter_location(compRef) 

540 all_info.append((location, formatter, componentStorage, component)) 

541 

542 else: 

543 # Always use the composite ref if no disassembly 

544 location, formatter = self._determine_put_formatter_location(ref) 

545 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

546 

547 # Convert the list of tuples to have StoredFileInfo as second element 

548 return [ 

549 ( 

550 location, 

551 StoredFileInfo( 

552 formatter=formatter, 

553 path=location.pathInStore.path, 

554 storageClass=storageClass, 

555 component=component, 

556 checksum=None, 

557 file_size=-1, 

558 dataset_id=ref.id, 

559 ), 

560 ) 

561 for location, formatter, storageClass, component in all_info 

562 ] 

563 

564 def _prepare_for_get( 

565 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

566 ) -> list[DatastoreFileGetInformation]: 

567 """Check parameters for ``get`` and obtain formatter and 

568 location. 

569 

570 Parameters 

571 ---------- 

572 ref : `DatasetRef` 

573 Reference to the required Dataset. 

574 parameters : `dict` 

575 `StorageClass`-specific parameters that specify, for example, 

576 a slice of the dataset to be loaded. 

577 

578 Returns 

579 ------- 

580 getInfo : `list` [`DatastoreFileGetInformation`] 

581 Parameters needed to retrieve each file. 

582 """ 

583 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

584 

585 # The storage class we want to use eventually 

586 refStorageClass = ref.datasetType.storageClass 

587 

588 # For trusted mode need to reset storage class. 

589 ref = self._cast_storage_class(ref) 

590 

591 # Get file metadata and internal metadata 

592 fileLocations = self._get_dataset_locations_info(ref) 

593 if not fileLocations: 

594 if not self.trustGetRequest: 

595 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

596 # Assume the dataset is where we think it should be 

597 fileLocations = self._get_expected_dataset_locations_info(ref) 

598 

599 if len(fileLocations) > 1: 

600 disassembled = True 

601 

602 # If trust is involved it is possible that there will be 

603 # components listed here that do not exist in the datastore. 

604 # Explicitly check for file artifact existence and filter out any 

605 # that are missing. 

606 if self.trustGetRequest: 

607 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

608 

609 # For now complain only if we have no components at all. One 

610 # component is probably a problem but we can punt that to the 

611 # assembler. 

612 if not fileLocations: 

613 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

614 

615 else: 

616 disassembled = False 

617 

618 # Is this a component request? 

619 refComponent = ref.datasetType.component() 

620 

621 fileGetInfo = [] 

622 for location, storedFileInfo in fileLocations: 

623 # The storage class used to write the file 

624 writeStorageClass = storedFileInfo.storageClass 

625 

626 # If this has been disassembled we need read to match the write 

627 if disassembled: 

628 readStorageClass = writeStorageClass 

629 else: 

630 readStorageClass = refStorageClass 

631 

632 formatter = get_instance_of( 

633 storedFileInfo.formatter, 

634 FileDescriptor( 

635 location, 

636 readStorageClass=readStorageClass, 

637 storageClass=writeStorageClass, 

638 parameters=parameters, 

639 ), 

640 ref.dataId, 

641 ) 

642 

643 formatterParams, notFormatterParams = formatter.segregateParameters() 

644 

645 # Of the remaining parameters, extract the ones supported by 

646 # this StorageClass (for components not all will be handled) 

647 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

648 

649 # The ref itself could be a component if the dataset was 

650 # disassembled by butler, or we disassembled in datastore and 

651 # components came from the datastore records 

652 component = storedFileInfo.component if storedFileInfo.component else refComponent 

653 

654 fileGetInfo.append( 

655 DatastoreFileGetInformation( 

656 location, 

657 formatter, 

658 storedFileInfo, 

659 assemblerParams, 

660 formatterParams, 

661 component, 

662 readStorageClass, 

663 ) 

664 ) 

665 

666 return fileGetInfo 

667 

668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

669 """Check the arguments for ``put`` and obtain formatter and 

670 location. 

671 

672 Parameters 

673 ---------- 

674 inMemoryDataset : `object` 

675 The dataset to store. 

676 ref : `DatasetRef` 

677 Reference to the associated Dataset. 

678 

679 Returns 

680 ------- 

681 location : `Location` 

682 The location to write the dataset. 

683 formatter : `Formatter` 

684 The `Formatter` to use to write the dataset. 

685 

686 Raises 

687 ------ 

688 TypeError 

689 Supplied object and storage class are inconsistent. 

690 DatasetTypeNotSupportedError 

691 The associated `DatasetType` is not handled by this datastore. 

692 """ 

693 self._validate_put_parameters(inMemoryDataset, ref) 

694 return self._determine_put_formatter_location(ref) 

695 

696 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

697 """Calculate the formatter and output location to use for put. 

698 

699 Parameters 

700 ---------- 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 """ 

711 # Work out output file name 

712 try: 

713 template = self.templates.getTemplate(ref) 

714 except KeyError as e: 

715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

716 

717 # Validate the template to protect against filenames from different 

718 # dataIds returning the same and causing overwrite confusion. 

719 template.validateTemplate(ref) 

720 

721 location = self.locationFactory.fromPath(template.format(ref)) 

722 

723 # Get the formatter based on the storage class 

724 storageClass = ref.datasetType.storageClass 

725 try: 

726 formatter = self.formatterFactory.getFormatter( 

727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

728 ) 

729 except KeyError as e: 

730 raise DatasetTypeNotSupportedError( 

731 f"Unable to find formatter for {ref} in datastore {self.name}" 

732 ) from e 

733 

734 # Now that we know the formatter, update the location 

735 location = formatter.makeUpdatedLocation(location) 

736 

737 return location, formatter 

738 

739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

740 # Docstring inherited from base class 

741 if transfer != "auto": 

742 return transfer 

743 

744 # See if the paths are within the datastore or not 

745 inside = [self._pathInStore(d.path) is not None for d in datasets] 

746 

747 if all(inside): 

748 transfer = None 

749 elif not any(inside): 

750 # Allow ResourcePath to use its own knowledge 

751 transfer = "auto" 

752 else: 

753 # This can happen when importing from a datastore that 

754 # has had some datasets ingested using "direct" mode, 

755 # i.e. a datastore holding datasets that are referenced by 

756 # absolute URIs outside its root. Also allow ResourcePath 

757 # to sort it out, but warn about it. 

758 log.warning( 

759 "Some datasets are inside the datastore and some are outside. Using 'split' " 

760 "transfer mode. This assumes that the files outside the datastore are " 

761 "still accessible to the new butler since they will not be copied into " 

762 "the target datastore." 

763 ) 

764 transfer = "split" 

765 

766 return transfer 

767 
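For ``transfer="auto"`` the choice collapses to a three-way decision on the per-dataset "already inside the datastore?" flags. A minimal sketch of just that decision (the real method also emits the warning shown above):

from __future__ import annotations


def choose_transfer(inside: list[bool]) -> str | None:
    # Mirrors the branch structure used above for transfer="auto".
    if all(inside):
        return None  # everything already inside: no transfer needed
    if not any(inside):
        return "auto"  # everything outside: let ResourcePath decide
    return "split"  # mixed: outside files are ingested by absolute URI


assert choose_transfer([True, True]) is None
assert choose_transfer([False, False]) == "auto"
assert choose_transfer([True, False]) == "split"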

768 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

769 """Return path relative to datastore root 

770 

771 Parameters 

772 ---------- 

773 path : `lsst.resources.ResourcePathExpression` 

774 Path to dataset. Can be an absolute URI. If relative, it is 

775 assumed to be relative to the datastore root. Paths outside 

776 the datastore result in `None` (see Returns). 

777 

778 Returns 

779 ------- 

780 inStore : `str` 

781 Path relative to datastore root. Returns `None` if the file is 

782 outside the root. 

783 """ 

784 # Relative path will always be relative to datastore 

785 pathUri = ResourcePath(path, forceAbsolute=False) 

786 return pathUri.relative_to(self.root) 

787 
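A short usage sketch of the ``ResourcePath.relative_to`` call that does the work here, assuming ``lsst.resources`` is installed; the paths are made up for illustration:

from lsst.resources import ResourcePath

root = ResourcePath("/repo/butler/", forceDirectory=True)

inside = ResourcePath("/repo/butler/raw/file.fits")
outside = ResourcePath("/elsewhere/file.fits")

print(inside.relative_to(root))   # "raw/file.fits"
print(outside.relative_to(root))  # None -> treated as outside the datastore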

788 def _standardizeIngestPath( 

789 self, path: str | ResourcePath, *, transfer: str | None = None 

790 ) -> str | ResourcePath: 

791 """Standardize the path of a to-be-ingested file. 

792 

793 Parameters 

794 ---------- 

795 path : `str` or `lsst.resources.ResourcePath` 

796 Path of a file to be ingested. This parameter is not expected 

797 to be all the types that can be used to construct a 

798 `~lsst.resources.ResourcePath`. 

799 transfer : `str`, optional 

800 How (and whether) the dataset should be added to the datastore. 

801 See `ingest` for details of transfer modes. 

802 This implementation is provided only so 

803 `NotImplementedError` can be raised if the mode is not supported; 

804 actual transfers are deferred to `_extractIngestInfo`. 

805 

806 Returns 

807 ------- 

808 path : `str` or `lsst.resources.ResourcePath` 

809 New path in what the datastore considers standard form. If an 

810 absolute URI was given that will be returned unchanged. 

811 

812 Notes 

813 ----- 

814 Subclasses of `FileDatastore` can implement this method instead 

815 of `_prepIngest`. It should not modify the data repository or given 

816 file in any way. 

817 

818 Raises 

819 ------ 

820 NotImplementedError 

821 Raised if the datastore does not support the given transfer mode 

822 (including the case where ingest is not supported at all). 

823 FileNotFoundError 

824 Raised if one of the given files does not exist. 

825 """ 

826 if transfer not in (None, "direct", "split") + self.root.transferModes: 

827 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

828 

829 # A relative URI indicates relative to datastore root 

830 srcUri = ResourcePath(path, forceAbsolute=False) 

831 if not srcUri.isabs(): 

832 srcUri = self.root.join(path) 

833 

834 if not srcUri.exists(): 

835 raise FileNotFoundError( 

836 f"Resource at {srcUri} does not exist; note that paths to ingest " 

837 f"are assumed to be relative to {self.root} unless they are absolute." 

838 ) 

839 

840 if transfer is None: 

841 relpath = srcUri.relative_to(self.root) 

842 if not relpath: 

843 raise RuntimeError( 

844 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

845 ) 

846 

847 # Return the relative path within the datastore for internal 

848 # transfer 

849 path = relpath 

850 

851 return path 

852 

853 def _extractIngestInfo( 

854 self, 

855 path: ResourcePathExpression, 

856 ref: DatasetRef, 

857 *, 

858 formatter: Formatter | type[Formatter], 

859 transfer: str | None = None, 

860 record_validation_info: bool = True, 

861 ) -> StoredFileInfo: 

862 """Relocate (if necessary) and extract `StoredFileInfo` from a 

863 to-be-ingested file. 

864 

865 Parameters 

866 ---------- 

867 path : `lsst.resources.ResourcePathExpression` 

868 URI or path of a file to be ingested. 

869 ref : `DatasetRef` 

870 Reference for the dataset being ingested. Guaranteed to have 

871 ``dataset_id not None``. 

872 formatter : `type` or `Formatter` 

873 `Formatter` subclass to use for this dataset or an instance. 

874 transfer : `str`, optional 

875 How (and whether) the dataset should be added to the datastore. 

876 See `ingest` for details of transfer modes. 

877 record_validation_info : `bool`, optional 

878 If `True`, the default, the datastore can record validation 

879 information associated with the file. If `False` the datastore 

880 will not attempt to track any information such as checksums 

881 or file sizes. This can be useful if such information is tracked 

882 in an external system or if the file is to be compressed in place. 

883 It is up to the datastore whether this parameter is relevant. 

884 

885 Returns 

886 ------- 

887 info : `StoredFileInfo` 

888 Internal datastore record for this file. This will be inserted by 

889 the caller; the `_extractIngestInfo` is only responsible for 

890 creating and populating the struct. 

891 

892 Raises 

893 ------ 

894 FileNotFoundError 

895 Raised if one of the given files does not exist. 

896 FileExistsError 

897 Raised if transfer is not `None` but the (internal) location the 

898 file would be moved to is already occupied. 

899 """ 

900 if self._transaction is None: 

901 raise RuntimeError("Ingest called without transaction enabled") 

902 

903 # Create URI of the source path, do not need to force a relative 

904 # path to absolute. 

905 srcUri = ResourcePath(path, forceAbsolute=False) 

906 

907 # Track whether we have read the size of the source yet 

908 have_sized = False 

909 

910 tgtLocation: Location | None 

911 if transfer is None or transfer == "split": 

912 # A relative path is assumed to be relative to the datastore 

913 # in this context 

914 if not srcUri.isabs(): 

915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

916 else: 

917 # Work out the path in the datastore from an absolute URI 

918 # This is required to be within the datastore. 

919 pathInStore = srcUri.relative_to(self.root) 

920 if pathInStore is None and transfer is None: 

921 raise RuntimeError( 

922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

923 ) 

924 if pathInStore: 

925 tgtLocation = self.locationFactory.fromPath(pathInStore) 

926 elif transfer == "split": 

927 # Outside the datastore but treat that as a direct ingest 

928 # instead. 

929 tgtLocation = None 

930 else: 

931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

932 elif transfer == "direct": 

933 # Want to store the full URI to the resource directly in 

934 # datastore. This is useful for referring to permanent archive 

935 # storage for raw data. 

936 # Trust that people know what they are doing. 

937 tgtLocation = None 

938 else: 

939 # Work out the name we want this ingested file to have 

940 # inside the datastore 

941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

942 if not tgtLocation.uri.dirname().exists(): 

943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

944 tgtLocation.uri.dirname().mkdir() 

945 

946 # if we are transferring from a local file to a remote location 

947 # it may be more efficient to get the size and checksum of the 

948 # local file rather than the transferred one 

949 if record_validation_info and srcUri.isLocal: 

950 size = srcUri.size() 

951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

952 have_sized = True 

953 

954 # Transfer the resource to the destination. 

955 # Allow overwrite of an existing file. This matches the behavior 

956 # of datastore.put() in that it trusts that registry would not 

957 # be asking to overwrite unless registry thought that the 

958 # overwrite was allowed. 

959 tgtLocation.uri.transfer_from( 

960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

961 ) 

962 

963 if tgtLocation is None: 

964 # This means we are using direct mode 

965 targetUri = srcUri 

966 targetPath = str(srcUri) 

967 else: 

968 targetUri = tgtLocation.uri 

969 targetPath = tgtLocation.pathInStore.path 

970 

971 # the file should exist in the datastore now 

972 if record_validation_info: 

973 if not have_sized: 

974 size = targetUri.size() 

975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

976 else: 

977 # Not recording any file information. 

978 size = -1 

979 checksum = None 

980 

981 return StoredFileInfo( 

982 formatter=formatter, 

983 path=targetPath, 

984 storageClass=ref.datasetType.storageClass, 

985 component=ref.datasetType.component(), 

986 file_size=size, 

987 checksum=checksum, 

988 dataset_id=ref.id, 

989 ) 

990 

991 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

992 # Docstring inherited from Datastore._prepIngest. 

993 filtered = [] 

994 for dataset in datasets: 

995 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

996 if not acceptable: 

997 continue 

998 else: 

999 dataset.refs = acceptable 

1000 if dataset.formatter is None: 

1001 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1002 else: 

1003 assert isinstance(dataset.formatter, (type, str)) 

1004 formatter_class = get_class_of(dataset.formatter) 

1005 if not issubclass(formatter_class, Formatter): 

1006 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1007 dataset.formatter = formatter_class 

1008 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1009 filtered.append(dataset) 

1010 return _IngestPrepData(filtered) 

1011 

1012 @transactional 

1013 def _finishIngest( 

1014 self, 

1015 prepData: Datastore.IngestPrepData, 

1016 *, 

1017 transfer: str | None = None, 

1018 record_validation_info: bool = True, 

1019 ) -> None: 

1020 # Docstring inherited from Datastore._finishIngest. 

1021 refsAndInfos = [] 

1022 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1023 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1024 # Do ingest as if the first dataset ref is associated with the file 

1025 info = self._extractIngestInfo( 

1026 dataset.path, 

1027 dataset.refs[0], 

1028 formatter=dataset.formatter, 

1029 transfer=transfer, 

1030 record_validation_info=record_validation_info, 

1031 ) 

1032 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1033 self._register_datasets(refsAndInfos) 

1034 

1035 def _calculate_ingested_datastore_name( 

1036 self, 

1037 srcUri: ResourcePath, 

1038 ref: DatasetRef, 

1039 formatter: Formatter | type[Formatter] | None = None, 

1040 ) -> Location: 

1041 """Given a source URI and a DatasetRef, determine the name the 

1042 dataset will have inside datastore. 

1043 

1044 Parameters 

1045 ---------- 

1046 srcUri : `lsst.resources.ResourcePath` 

1047 URI to the source dataset file. 

1048 ref : `DatasetRef` 

1049 Ref associated with the newly-ingested dataset artifact. This 

1050 is used to determine the name within the datastore. 

1051 formatter : `Formatter` or `Formatter` class, optional 

1052 Formatter to use for validation. Can be a class or an instance. 

1053 No validation of the file extension is performed if the 

1054 ``formatter`` is `None`. This can be used if the caller knows 

1055 that the source URI and target URI will use the same formatter. 

1056 

1057 Returns 

1058 ------- 

1059 location : `Location` 

1060 Target location for the newly-ingested dataset. 

1061 """ 

1062 # Ingesting a file from outside the datastore. 

1063 # This involves a new name. 

1064 template = self.templates.getTemplate(ref) 

1065 location = self.locationFactory.fromPath(template.format(ref)) 

1066 

1067 # Get the extension 

1068 ext = srcUri.getExtension() 

1069 

1070 # Update the destination to include that extension 

1071 location.updateExtension(ext) 

1072 

1073 # Ask the formatter to validate this extension 

1074 if formatter is not None: 

1075 formatter.validateExtension(location) 

1076 

1077 return location 

1078 

1079 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1080 """Write out in memory dataset to datastore. 

1081 

1082 Parameters 

1083 ---------- 

1084 inMemoryDataset : `object` 

1085 Dataset to write to datastore. 

1086 ref : `DatasetRef` 

1087 Registry information associated with this dataset. 

1088 

1089 Returns 

1090 ------- 

1091 info : `StoredFileInfo` 

1092 Information describing the artifact written to the datastore. 

1093 """ 

1094 # May need to coerce the in memory dataset to the correct 

1095 # python type, but first we need to make sure the storage class 

1096 # reflects the one defined in the data repository. 

1097 ref = self._cast_storage_class(ref) 

1098 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1099 

1100 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1101 uri = location.uri 

1102 

1103 if not uri.dirname().exists(): 

1104 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1105 uri.dirname().mkdir() 

1106 

1107 if self._transaction is None: 

1108 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1109 

1110 def _removeFileExists(uri: ResourcePath) -> None: 

1111 """Remove a file and do not complain if it is not there. 

1112 

1113 This is important since a formatter might fail before the file 

1114 is written and we should not confuse people by writing spurious 

1115 error messages to the log. 

1116 """ 

1117 try: 

1118 uri.remove() 

1119 except FileNotFoundError: 

1120 pass 

1121 

1122 # Register a callback to try to delete the uploaded data if 

1123 # something fails below 

1124 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1125 

1126 data_written = False 

1127 if not uri.isLocal: 

1128 # This is a remote URI. Some datasets can be serialized directly 

1129 # to bytes and sent to the remote datastore without writing a 

1130 # file. If the dataset is intended to be saved to the cache 

1131 # a file is always written and direct write to the remote 

1132 # datastore is bypassed. 

1133 if not self.cacheManager.should_be_cached(ref): 

1134 try: 

1135 serializedDataset = formatter.toBytes(inMemoryDataset) 

1136 except NotImplementedError: 

1137 # Fallback to the file writing option. 

1138 pass 

1139 except Exception as e: 

1140 raise RuntimeError( 

1141 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1142 ) from e 

1143 else: 

1144 log.debug("Writing bytes directly to %s", uri) 

1145 uri.write(serializedDataset, overwrite=True) 

1146 log.debug("Successfully wrote bytes directly to %s", uri) 

1147 data_written = True 

1148 

1149 if not data_written: 

1150 # Did not write the bytes directly to object store so instead 

1151 # write to temporary file. Always write to a temporary even if 

1152 # using a local file system -- that gives us atomic writes. 

1153 # If a process is killed as the file is being written we do not 

1154 # want it to remain in the correct place but in a corrupt state. 

1155 # For local files write to the output directory not temporary dir. 

1156 prefix = uri.dirname() if uri.isLocal else None 

1157 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1158 # Need to configure the formatter to write to a different 

1159 # location and that needs us to overwrite internals 

1160 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1161 with formatter._updateLocation(Location(None, temporary_uri)): 

1162 try: 

1163 formatter.write(inMemoryDataset) 

1164 except Exception as e: 

1165 raise RuntimeError( 

1166 f"Failed to serialize dataset {ref} of type" 

1167 f" {type(inMemoryDataset)} to " 

1168 f"temporary location {temporary_uri}" 

1169 ) from e 

1170 

1171 # Use move for a local file since that becomes an efficient 

1172 # os.rename. For remote resources we use copy to allow the 

1173 # file to be cached afterwards. 

1174 transfer = "move" if uri.isLocal else "copy" 

1175 

1176 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1177 

1178 if transfer == "copy": 

1179 # Cache if required 

1180 self.cacheManager.move_to_cache(temporary_uri, ref) 

1181 

1182 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1183 

1184 # URI is needed to resolve what ingest case are we dealing with 

1185 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1186 
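The write path above uses a write-to-temporary-then-transfer pattern so that a partially written file never appears at the final URI. A condensed sketch of that pattern using only ``lsst.resources`` (no formatter, cache, or transaction involved); the target path is made up and its parent directory is assumed to exist:

from lsst.resources import ResourcePath

target = ResourcePath("/repo/butler/run/dataset.json")

# Local targets get an efficient os.rename via "move"; remote targets use
# "copy" so the temporary file could still be handed to a cache afterwards.
transfer = "move" if target.isLocal else "copy"

with ResourcePath.temporary_uri(suffix=target.getExtension()) as tmp:
    tmp.write(b'{"payload": 42}', overwrite=True)  # stand-in for formatter.write()
    target.transfer_from(tmp, transfer=transfer, overwrite=True)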

1187 def _read_artifact_into_memory( 

1188 self, 

1189 getInfo: DatastoreFileGetInformation, 

1190 ref: DatasetRef, 

1191 isComponent: bool = False, 

1192 cache_ref: DatasetRef | None = None, 

1193 ) -> Any: 

1194 """Read the artifact from datastore into in memory object. 

1195 

1196 Parameters 

1197 ---------- 

1198 getInfo : `DatastoreFileGetInformation` 

1199 Information about the artifact within the datastore. 

1200 ref : `DatasetRef` 

1201 The registry information associated with this artifact. 

1202 isComponent : `bool` 

1203 Flag to indicate if a component is being read from this artifact. 

1204 cache_ref : `DatasetRef`, optional 

1205 The DatasetRef to use when looking up the file in the cache. 

1206 This ref must have the same ID as the supplied ref but can 

1207 be a parent ref or component ref to indicate to the cache whether 

1208 a composite file is being requested from the cache or a component 

1209 file. Without this the cache will default to the supplied ref but 

1210 it can get confused with read-only derived components for 

1211 disassembled composites. 

1212 

1213 Returns 

1214 ------- 

1215 inMemoryDataset : `object` 

1216 The artifact as a python object. 

1217 """ 

1218 location = getInfo.location 

1219 uri = location.uri 

1220 log.debug("Accessing data from %s", uri) 

1221 

1222 if cache_ref is None: 

1223 cache_ref = ref 

1224 if cache_ref.id != ref.id: 

1225 raise ValueError( 

1226 "The supplied cache dataset ref refers to a different dataset than expected:" 

1227 f" {ref.id} != {cache_ref.id}" 

1228 ) 

1229 

1230 # Cannot recalculate checksum but can compare size as a quick check 

1231 # Do not do this if the size is negative since that indicates 

1232 # we do not know. 

1233 recorded_size = getInfo.info.file_size 

1234 resource_size = uri.size() 

1235 if recorded_size >= 0 and resource_size != recorded_size: 

1236 raise RuntimeError( 

1237 "Integrity failure in Datastore. " 

1238 f"Size of file {uri} ({resource_size}) " 

1239 f"does not match size recorded in registry of {recorded_size}" 

1240 ) 

1241 

1242 # For the general case we have choices for how to proceed. 

1243 # 1. Always use a local file (downloading the remote resource to a 

1244 # temporary file if needed). 

1245 # 2. Use a threshold size and read into memory and use bytes. 

1246 # Use both for now with an arbitrary hand off size. 

1247 # This allows small datasets to be downloaded from remote object 

1248 # stores without requiring a temporary file. 

1249 

1250 formatter = getInfo.formatter 

1251 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1252 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1253 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1254 if cached_file is not None: 

1255 desired_uri = cached_file 

1256 msg = f" (cached version of {uri})" 

1257 else: 

1258 desired_uri = uri 

1259 msg = "" 

1260 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1261 serializedDataset = desired_uri.read() 

1262 log.debug( 

1263 "Deserializing %s from %d bytes from location %s with formatter %s", 

1264 f"component {getInfo.component}" if isComponent else "", 

1265 len(serializedDataset), 

1266 uri, 

1267 formatter.name(), 

1268 ) 

1269 try: 

1270 result = formatter.fromBytes( 

1271 serializedDataset, component=getInfo.component if isComponent else None 

1272 ) 

1273 except Exception as e: 

1274 raise ValueError( 

1275 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1276 f" ({ref.datasetType.name} from {uri}): {e}" 

1277 ) from e 

1278 else: 

1279 # Read from file. 

1280 

1281 # Have to update the Location associated with the formatter 

1282 # because formatter.read does not allow an override. 

1283 # This could be improved. 

1284 location_updated = False 

1285 msg = "" 

1286 

1287 # First check in cache for local version. 

1288 # The cache will only be relevant for remote resources but 

1289 # no harm in always asking. Context manager ensures that cache 

1290 # file is not deleted during cache expiration. 

1291 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1292 if cached_file is not None: 

1293 msg = f"(via cache read of remote file {uri})" 

1294 uri = cached_file 

1295 location_updated = True 

1296 

1297 with uri.as_local() as local_uri: 

1298 can_be_cached = False 

1299 if uri != local_uri: 

1300 # URI was remote and file was downloaded 

1301 cache_msg = "" 

1302 location_updated = True 

1303 

1304 if self.cacheManager.should_be_cached(cache_ref): 

1305 # In this scenario we want to ask if the downloaded 

1306 # file should be cached but we should not cache 

1307 # it until after we've used it (to ensure it can't 

1308 # be expired whilst we are using it). 

1309 can_be_cached = True 

1310 

1311 # Say that it is "likely" to be cached because 

1312 # if the formatter read fails we will not be 

1313 # caching this file. 

1314 cache_msg = " and likely cached" 

1315 

1316 msg = f"(via download to local file{cache_msg})" 

1317 

1318 # Calculate the (possibly) new location for the formatter 

1319 # to use. 

1320 newLocation = Location(*local_uri.split()) if location_updated else None 

1321 

1322 log.debug( 

1323 "Reading%s from location %s %s with formatter %s", 

1324 f" component {getInfo.component}" if isComponent else "", 

1325 uri, 

1326 msg, 

1327 formatter.name(), 

1328 ) 

1329 try: 

1330 with formatter._updateLocation(newLocation): 

1331 with time_this( 

1332 log, 

1333 msg="Reading%s from location %s %s with formatter %s", 

1334 args=( 

1335 f" component {getInfo.component}" if isComponent else "", 

1336 uri, 

1337 msg, 

1338 formatter.name(), 

1339 ), 

1340 ): 

1341 result = formatter.read(component=getInfo.component if isComponent else None) 

1342 except Exception as e: 

1343 raise ValueError( 

1344 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1345 f" ({ref.datasetType.name} from {uri}): {e}" 

1346 ) from e 

1347 

1348 # File was read successfully so can move to cache 

1349 if can_be_cached: 

1350 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1351 

1352 return self._post_process_get( 

1353 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1354 ) 

1355 
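The read path chooses between deserializing bytes directly and downloading to a (possibly cached) local file, based on the size threshold and whether the formatter can read bytes. The decision itself is tiny; a standalone sketch:

NBYTES_MAX = 10_000_000  # same arbitrary hand-off size as used above


def use_bytes_path(resource_size: int, can_read_bytes: bool) -> bool:
    # Small artifacts whose formatter can deserialize from bytes are read
    # directly; everything else goes through a local file.
    return resource_size <= NBYTES_MAX and can_read_bytes


assert use_bytes_path(4_096, True) is True
assert use_bytes_path(4_096, False) is False  # formatter needs a real file
assert use_bytes_path(50_000_000, True) is False  # too large: download instead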

1356 def knows(self, ref: DatasetRef) -> bool: 

1357 """Check if the dataset is known to the datastore. 

1358 

1359 Does not check for existence of any artifact. 

1360 

1361 Parameters 

1362 ---------- 

1363 ref : `DatasetRef` 

1364 Reference to the required dataset. 

1365 

1366 Returns 

1367 ------- 

1368 exists : `bool` 

1369 `True` if the dataset is known to the datastore. 

1370 """ 

1371 fileLocations = self._get_dataset_locations_info(ref) 

1372 if fileLocations: 

1373 return True 

1374 return False 

1375 

1376 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1377 # Docstring inherited from the base class. 

1378 

1379 # The records themselves. Could be missing some entries. 

1380 records = self._get_stored_records_associated_with_refs(refs) 

1381 

1382 return {ref: ref.id in records for ref in refs} 

1383 

1384 def _process_mexists_records( 

1385 self, 

1386 id_to_ref: dict[DatasetId, DatasetRef], 

1387 records: dict[DatasetId, list[StoredFileInfo]], 

1388 all_required: bool, 

1389 artifact_existence: dict[ResourcePath, bool] | None = None, 

1390 ) -> dict[DatasetRef, bool]: 

1391 """Helper function for mexists that checks the given records. 

1392 

1393 Parameters 

1394 ---------- 

1395 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1396 Mapping of the dataset ID to the dataset ref itself. 

1397 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1398 Records as generally returned by 

1399 ``_get_stored_records_associated_with_refs``. 

1400 all_required : `bool` 

1401 Flag indicating whether a dataset requires all of its associated 

1402 artifacts to exist, or whether a single existing artifact suffices. 

1403 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1404 Optional mapping of datastore artifact to existence. Updated by 

1405 this method with details of all artifacts tested. Can be `None` 

1406 if the caller is not interested. 

1407 

1408 Returns 

1409 ------- 

1410 existence : `dict` of [`DatasetRef`, `bool`] 

1411 Mapping from dataset to boolean indicating existence. 

1412 """ 

1413 # The URIs to be checked and a mapping of those URIs to 

1414 # the dataset ID. 

1415 uris_to_check: list[ResourcePath] = [] 

1416 location_map: dict[ResourcePath, DatasetId] = {} 

1417 

1418 location_factory = self.locationFactory 

1419 

1420 uri_existence: dict[ResourcePath, bool] = {} 

1421 for ref_id, infos in records.items(): 

1422 # Key is the dataset Id, value is list of StoredItemInfo 

1423 uris = [info.file_location(location_factory).uri for info in infos] 

1424 location_map.update({uri: ref_id for uri in uris}) 

1425 

1426 # Check the local cache directly for a dataset corresponding 

1427 # to the remote URI. 

1428 if self.cacheManager.file_count > 0: 

1429 ref = id_to_ref[ref_id] 

1430 for uri, storedFileInfo in zip(uris, infos): 

1431 check_ref = ref 

1432 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1433 check_ref = ref.makeComponentRef(component) 

1434 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1435 # Proxy for URI existence. 

1436 uri_existence[uri] = True 

1437 else: 

1438 uris_to_check.append(uri) 

1439 else: 

1440 # Check all of them. 

1441 uris_to_check.extend(uris) 

1442 

1443 if artifact_existence is not None: 

1444 # If a URI has already been checked remove it from the list 

1445 # and immediately add the status to the output dict. 

1446 filtered_uris_to_check = [] 

1447 for uri in uris_to_check: 

1448 if uri in artifact_existence: 

1449 uri_existence[uri] = artifact_existence[uri] 

1450 else: 

1451 filtered_uris_to_check.append(uri) 

1452 uris_to_check = filtered_uris_to_check 

1453 

1454 # Results. 

1455 dataset_existence: dict[DatasetRef, bool] = {} 

1456 

1457 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1458 for uri, exists in uri_existence.items(): 

1459 dataset_id = location_map[uri] 

1460 ref = id_to_ref[dataset_id] 

1461 

1462 # Disassembled composite needs to check all locations. 

1463 # all_required indicates whether all need to exist or not. 

1464 if ref in dataset_existence: 

1465 if all_required: 

1466 exists = dataset_existence[ref] and exists 

1467 else: 

1468 exists = dataset_existence[ref] or exists 

1469 dataset_existence[ref] = exists 

1470 

1471 if artifact_existence is not None: 

1472 artifact_existence.update(uri_existence) 

1473 

1474 return dataset_existence 

1475 
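For disassembled composites the per-artifact existence results are folded into a single answer per dataset, with ``all_required`` selecting between AND and OR semantics. A standalone sketch of that fold:

def combine(per_artifact: list[bool], all_required: bool) -> bool:
    # all_required=True: every component artifact must exist.
    # all_required=False: any surviving artifact counts as existence.
    return all(per_artifact) if all_required else any(per_artifact)


artifacts = [True, True, False]  # e.g. one missing component file
assert combine(artifacts, all_required=True) is False
assert combine(artifacts, all_required=False) is True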

1476 def mexists( 

1477 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1478 ) -> dict[DatasetRef, bool]: 

1479 """Check the existence of multiple datasets at once. 

1480 

1481 Parameters 

1482 ---------- 

1483 refs : iterable of `DatasetRef` 

1484 The datasets to be checked. 

1485 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1486 Optional mapping of datastore artifact to existence. Updated by 

1487 this method with details of all artifacts tested. Can be `None` 

1488 if the caller is not interested. 

1489 

1490 Returns 

1491 ------- 

1492 existence : `dict` of [`DatasetRef`, `bool`] 

1493 Mapping from dataset to boolean indicating existence. 

1494 

1495 Notes 

1496 ----- 

1497 To minimize potentially costly remote existence checks, the local 

1498 cache is checked as a proxy for existence. If a file for this 

1499 `DatasetRef` is present in the cache no check is made of the actual URI. This

1500 can result in unexpected behavior if the dataset itself

1501 has been removed from the datastore by another process whilst it is 

1502 still in the cache. 

1503 """ 

1504 chunk_size = 10_000 

1505 dataset_existence: dict[DatasetRef, bool] = {} 

1506 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1507 n_found_total = 0 

1508 n_checked = 0 

1509 n_chunks = 0 

1510 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1511 chunk_result = self._mexists(chunk, artifact_existence) 

1512 

1513 # The log message level and content depend on how many 

1514 # datasets we are processing. 

1515 n_results = len(chunk_result) 

1516 

1517 # Use verbose logging to ensure that messages can be seen 

1518 # easily if many refs are being checked. 

1519 log_threshold = VERBOSE 

1520 n_checked += n_results 

1521 

1522 # This sum can take some time so only do it if we know the 

1523 # result is going to be used. 

1524 n_found = 0 

1525 if log.isEnabledFor(log_threshold): 

1526 # Can treat the booleans as 0, 1 integers and sum them. 

1527 n_found = sum(chunk_result.values()) 

1528 n_found_total += n_found 

1529 

1530 # We are deliberately not trying to count the number of refs 

1531 # provided in case it's in the millions. This means there is a 

1532 # situation where the number of refs exactly matches the chunk 

1533 # size and we will switch to the multi-chunk path even though 

1534 # we only have a single chunk. 

1535 if n_results < chunk_size and n_chunks == 0: 

1536 # Single chunk will be processed so we can provide more detail. 

1537 if n_results == 1: 

1538 ref = list(chunk_result)[0] 

1539 # Use debug logging to be consistent with `exists()`. 

1540 log.debug( 

1541 "Calling mexists() with single ref that does%s exist (%s).", 

1542 "" if chunk_result[ref] else " not", 

1543 ref, 

1544 ) 

1545 else: 

1546 # Single chunk but multiple files. Summarize. 

1547 log.log( 

1548 log_threshold, 

1549 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1550 n_found, 

1551 n_checked, 

1552 ) 

1553 

1554 else: 

1555 # Use incremental verbose logging when we have multiple chunks. 

1556 log.log( 

1557 log_threshold, 

1558 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1559 "(running total from all chunks so far: %d found out of %d checked)", 

1560 n_chunks, 

1561 n_found, 

1562 n_results, 

1563 n_found_total, 

1564 n_checked, 

1565 ) 

1566 dataset_existence.update(chunk_result) 

1567 n_chunks += 1 

1568 

1569 return dataset_existence 

1570 
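# A minimal usage sketch (assuming `datastore` is a configured FileDatastore
# and `refs` are resolved DatasetRefs obtained elsewhere, e.g. from a
# registry query). Passing the same `artifact_existence` dict to repeated
# calls lets previously checked URIs be skipped.
#
# >>> artifact_existence: dict[ResourcePath, bool] = {}
# >>> existence = datastore.mexists(refs, artifact_existence)  # doctest: +SKIP
# >>> missing = [ref for ref, exists in existence.items() if not exists]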

1571 def _mexists( 

1572 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1573 ) -> dict[DatasetRef, bool]: 

1574 """Check the existence of multiple datasets at once. 

1575 

1576 Parameters 

1577 ---------- 

1578 refs : iterable of `DatasetRef` 

1579 The datasets to be checked. 

1580 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1581 Optional mapping of datastore artifact to existence. Updated by 

1582 this method with details of all artifacts tested. Can be `None` 

1583 if the caller is not interested. 

1584 

1585 Returns 

1586 ------- 

1587 existence : `dict` of [`DatasetRef`, `bool`] 

1588 Mapping from dataset to boolean indicating existence. 

1589 """ 

1590 # Make a mapping from refs with the internal storage class to the given 

1591 # refs that may have a different one. We'll use the internal refs 

1592 # throughout this method and convert back at the very end. 

1593 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1594 

1595 # Need a mapping of dataset_id to (internal) dataset ref since some 

1596 # internal APIs work with dataset_id. 

1597 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1598 

1599 # Set of all IDs we are checking for. 

1600 requested_ids = set(id_to_ref.keys()) 

1601 

1602 # The records themselves. Could be missing some entries. 

1603 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1604 

1605 dataset_existence = self._process_mexists_records( 

1606 id_to_ref, records, True, artifact_existence=artifact_existence 

1607 ) 

1608 

1609 # Set of IDs that have been handled. 

1610 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1611 

1612 missing_ids = requested_ids - handled_ids 

1613 if missing_ids: 

1614 dataset_existence.update( 

1615 self._mexists_check_expected( 

1616 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1617 ) 

1618 ) 

1619 

1620 return { 

1621 internal_ref_to_input_ref[internal_ref]: existence 

1622 for internal_ref, existence in dataset_existence.items() 

1623 } 

1624 

1625 def _mexists_check_expected( 

1626 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1627 ) -> dict[DatasetRef, bool]: 

1628 """Check existence of refs that are not known to datastore. 

1629 

1630 Parameters 

1631 ---------- 

1632 refs : iterable of `DatasetRef` 

1633 The datasets to be checked. These are assumed not to be known 

1634 to datastore. 

1635 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1636 Optional mapping of datastore artifact to existence. Updated by 

1637 this method with details of all artifacts tested. Can be `None` 

1638 if the caller is not interested. 

1639 

1640 Returns 

1641 ------- 

1642 existence : `dict` of [`DatasetRef`, `bool`] 

1643 Mapping from dataset to boolean indicating existence. 

1644 """ 

1645 dataset_existence: dict[DatasetRef, bool] = {} 

1646 if not self.trustGetRequest: 

1647 # Must assume these do not exist 

1648 for ref in refs: 

1649 dataset_existence[ref] = False 

1650 else: 

1651 log.debug( 

1652 "%d datasets were not known to datastore during initial existence check.", 

1653 len(refs), 

1654 ) 

1655 

1656 # Construct data structure identical to that returned 

1657 # by _get_stored_records_associated_with_refs() but using 

1658 # guessed names. 

1659 records = {} 

1660 id_to_ref = {} 

1661 for missing_ref in refs: 

1662 expected = self._get_expected_dataset_locations_info(missing_ref) 

1663 dataset_id = missing_ref.id 

1664 records[dataset_id] = [info for _, info in expected] 

1665 id_to_ref[dataset_id] = missing_ref 

1666 

1667 dataset_existence.update( 

1668 self._process_mexists_records( 

1669 id_to_ref, 

1670 records, 

1671 False, 

1672 artifact_existence=artifact_existence, 

1673 ) 

1674 ) 

1675 

1676 return dataset_existence 

1677 

1678 def exists(self, ref: DatasetRef) -> bool: 

1679 """Check if the dataset exists in the datastore. 

1680 

1681 Parameters 

1682 ---------- 

1683 ref : `DatasetRef` 

1684 Reference to the required dataset. 

1685 

1686 Returns 

1687 ------- 

1688 exists : `bool` 

1689 `True` if the entity exists in the `Datastore`. 

1690 

1691 Notes 

1692 ----- 

1693 The local cache is checked as a proxy for existence in the remote 

1694 object store. It is possible that another process on a different 

1695 compute node could remove the file from the object store even 

1696 though it is present in the local cache. 

1697 """ 

1698 ref = self._cast_storage_class(ref) 

1699 fileLocations = self._get_dataset_locations_info(ref) 

1700 

1701 # If we are being asked to trust that the registry might not be correct

1702 # we ask for the expected locations and check them explicitly.

1703 if not fileLocations: 

1704 if not self.trustGetRequest: 

1705 return False 

1706 

1707 # First check the cache. If it is not found we must check 

1708 # the datastore itself. Assume that any component in the cache 

1709 # means that the dataset does exist somewhere. 

1710 if self.cacheManager.known_to_cache(ref): 

1711 return True 

1712 

1713 # When we are guessing a dataset location we can not check 

1714 # for the existence of every component since we can not 

1715 # know if every component was written. Instead we check 

1716 # for the existence of any of the expected locations. 

1717 for location, _ in self._get_expected_dataset_locations_info(ref): 

1718 if self._artifact_exists(location): 

1719 return True 

1720 return False 

1721 

1722 # All listed artifacts must exist. 

1723 for location, storedFileInfo in fileLocations: 

1724 # Checking in cache needs the component ref. 

1725 check_ref = ref 

1726 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1727 check_ref = ref.makeComponentRef(component) 

1728 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1729 continue 

1730 

1731 if not self._artifact_exists(location): 

1732 return False 

1733 

1734 return True 

1735 

1736 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1737 """Return URIs associated with dataset. 

1738 

1739 Parameters 

1740 ---------- 

1741 ref : `DatasetRef` 

1742 Reference to the required dataset. 

1743 predict : `bool`, optional 

1744 If the datastore does not know about the dataset, should it 

1745 return a predicted URI or not? 

1746 

1747 Returns 

1748 ------- 

1749 uris : `DatasetRefURIs` 

1750 The URI to the primary artifact associated with this dataset (if 

1751 the dataset was disassembled within the datastore this may be 

1752 `None`), and the URIs to any components associated with the dataset 

1753 artifact (can be empty if there are no components).

1754 """ 

1755 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1756 return many[ref] 

1757 

1758 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1759 """URI to the Dataset. 

1760 

1761 Parameters 

1762 ---------- 

1763 ref : `DatasetRef` 

1764 Reference to the required Dataset. 

1765 predict : `bool` 

1766 If `True`, allow URIs to be returned of datasets that have not 

1767 been written. 

1768 

1769 Returns 

1770 ------- 

1771 uri : `lsst.resources.ResourcePath`

1772 URI pointing to the dataset within the datastore. If the 

1773 dataset does not exist in the datastore, and if ``predict`` is 

1774 `True`, the URI will be a prediction and will include a URI 

1775 fragment "#predicted". 

1776 If the datastore does not have entities that relate well 

1777 to the concept of a URI the returned URI will be 

1778 descriptive. The returned URI is not guaranteed to be obtainable. 

1779 

1780 Raises 

1781 ------ 

1782 FileNotFoundError 

1783 Raised if a URI has been requested for a dataset that does not 

1784 exist and guessing is not allowed. 

1785 RuntimeError 

1786 Raised if a request is made for a single URI but multiple URIs 

1787 are associated with this dataset. 

1788 

1789 Notes 

1790 ----- 

1791 When a predicted URI is requested an attempt will be made to form 

1792 a reasonable URI based on file templates and the expected formatter. 

1793 """ 

1794 primary, components = self.getURIs(ref, predict) 

1795 if primary is None or components: 

1796 raise RuntimeError( 

1797 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1798 ) 

1799 return primary 

1800 
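# A minimal usage sketch (assuming `datastore` and a resolved `ref`).
# getURIs() is the safe entry point for possibly-disassembled datasets;
# getURI() raises RuntimeError when component URIs are present.
#
# >>> uris = datastore.getURIs(ref, predict=True)  # doctest: +SKIP
# >>> if uris.primaryURI is not None:
# ...     print(uris.primaryURI)
# ... else:
# ...     print(list(uris.componentURIs))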

1801 def _predict_URIs( 

1802 self, 

1803 ref: DatasetRef, 

1804 ) -> DatasetRefURIs: 

1805 """Predict the URIs of a dataset ref. 

1806 

1807 Parameters 

1808 ---------- 

1809 ref : `DatasetRef` 

1810 Reference to the required Dataset. 

1811 

1812 Returns 

1813 ------- 

1814 uris : `DatasetRefURIs`

1815 Primary and component URIs. URIs will contain a URI fragment 

1816 "#predicted". 

1817 """ 

1818 uris = DatasetRefURIs() 

1819 

1820 if self.composites.shouldBeDisassembled(ref): 

1821 for component, _ in ref.datasetType.storageClass.components.items(): 

1822 comp_ref = ref.makeComponentRef(component) 

1823 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1824 

1825 # Add the "#predicted" URI fragment to indicate this is a 

1826 # guess 

1827 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1828 

1829 else: 

1830 location, _ = self._determine_put_formatter_location(ref) 

1831 

1832 # Add the "#predicted" URI fragment to indicate this is a guess 

1833 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1834 

1835 return uris 

1836 

1837 def getManyURIs( 

1838 self, 

1839 refs: Iterable[DatasetRef], 

1840 predict: bool = False, 

1841 allow_missing: bool = False, 

1842 ) -> dict[DatasetRef, DatasetRefURIs]: 

1843 # Docstring inherited 

1844 

1845 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1846 

1847 records = self._get_stored_records_associated_with_refs(refs) 

1848 records_keys = records.keys() 

1849 

1850 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1851 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1852 

1853 # Have to handle trustGetRequest mode by checking for the existence 

1854 # of the missing refs on disk. 

1855 if missing_refs: 

1856 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1857 really_missing = set() 

1858 not_missing = set() 

1859 for ref, exists in dataset_existence.items(): 

1860 if exists: 

1861 not_missing.add(ref) 

1862 else: 

1863 really_missing.add(ref) 

1864 

1865 if not_missing: 

1866 # Need to recalculate the missing/existing split. 

1867 existing_refs = existing_refs + tuple(not_missing) 

1868 missing_refs = tuple(really_missing) 

1869 

1870 for ref in missing_refs: 

1871 # if this has never been written then we have to guess 

1872 if not predict: 

1873 if not allow_missing: 

1874 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1875 else: 

1876 uris[ref] = self._predict_URIs(ref) 

1877 

1878 for ref in existing_refs: 

1879 file_infos = records[ref.id] 

1880 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1881 uris[ref] = self._locations_to_URI(ref, file_locations) 

1882 

1883 return uris 

1884 

1885 def _locations_to_URI( 

1886 self, 

1887 ref: DatasetRef, 

1888 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1889 ) -> DatasetRefURIs: 

1890 """Convert one or more file locations associated with a DatasetRef 

1891 to a DatasetRefURIs. 

1892 

1893 Parameters 

1894 ---------- 

1895 ref : `DatasetRef` 

1896 Reference to the dataset. 

1897 file_locations : `Sequence` of `tuple` [`Location`, `StoredFileInfo`]

1898 Each item in the sequence is the location of the dataset within the 

1899 datastore and stored information about the file and its formatter. 

1900 If there is only one item in the sequence then it is treated as the 

1901 primary URI. If there is more than one item then they are treated 

1902 as component URIs. If there are no items then an error is raised 

1903 unless ``self.trustGetRequest`` is `True`. 

1904 

1905 Returns 

1906 ------- 

1907 uris : `DatasetRefURIs`

1908 Represents the primary URI or component URIs described by the 

1909 inputs. 

1910 

1911 Raises 

1912 ------ 

1913 RuntimeError 

1914 If no file locations are passed in and ``self.trustGetRequest`` is 

1915 `False`. 

1916 FileNotFoundError 

1917 If a passed-in URI does not exist and ``self.trustGetRequest``

1918 is `False`. 

1919 RuntimeError 

1920 If a passed-in `StoredFileInfo`'s ``component`` is `None` (this is

1921 unexpected). 

1922 """ 

1923 

1924 guessing = False 

1925 uris = DatasetRefURIs() 

1926 

1927 if not file_locations: 

1928 if not self.trustGetRequest: 

1929 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1930 file_locations = self._get_expected_dataset_locations_info(ref) 

1931 guessing = True 

1932 

1933 if len(file_locations) == 1: 

1934 # No disassembly so this is the primary URI 

1935 uris.primaryURI = file_locations[0][0].uri 

1936 if guessing and not uris.primaryURI.exists(): 

1937 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1938 else: 

1939 for location, file_info in file_locations: 

1940 if file_info.component is None: 

1941 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1942 if guessing and not location.uri.exists(): 

1943 # If we are trusting then it is entirely possible for 

1944 # some components to be missing. In that case we skip 

1945 # to the next component. 

1946 if self.trustGetRequest: 

1947 continue 

1948 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1949 uris.componentURIs[file_info.component] = location.uri 

1950 

1951 return uris 

1952 

1953 def retrieveArtifacts( 

1954 self, 

1955 refs: Iterable[DatasetRef], 

1956 destination: ResourcePath, 

1957 transfer: str = "auto", 

1958 preserve_path: bool = True, 

1959 overwrite: bool = False, 

1960 ) -> list[ResourcePath]: 

1961 """Retrieve the file artifacts associated with the supplied refs. 

1962 

1963 Parameters 

1964 ---------- 

1965 refs : iterable of `DatasetRef` 

1966 The datasets for which file artifacts are to be retrieved. 

1967 A single ref can result in multiple files. The refs must 

1968 be resolved. 

1969 destination : `lsst.resources.ResourcePath` 

1970 Location to write the file artifacts. 

1971 transfer : `str`, optional 

1972 Method to use to transfer the artifacts. Must be one of the options 

1973 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1974 "move" is not allowed. 

1975 preserve_path : `bool`, optional 

1976 If `True` the full path of the file artifact within the datastore 

1977 is preserved. If `False` the final file component of the path 

1978 is used. 

1979 overwrite : `bool`, optional 

1980 If `True` allow transfers to overwrite existing files at the 

1981 destination. 

1982 

1983 Returns 

1984 ------- 

1985 targets : `list` of `lsst.resources.ResourcePath` 

1986 URIs of file artifacts in destination location. Order is not 

1987 preserved. 

1988 """ 

1989 if not destination.isdir(): 

1990 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1991 

1992 if transfer == "move": 

1993 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1994 

1995 # Source -> Destination 

1996 # This also helps filter out duplicate DatasetRef in the request 

1997 # that will map to the same underlying file transfer. 

1998 to_transfer: dict[ResourcePath, ResourcePath] = {} 

1999 

2000 for ref in refs: 

2001 locations = self._get_dataset_locations_info(ref) 

2002 for location, _ in locations: 

2003 source_uri = location.uri 

2004 target_path: ResourcePathExpression 

2005 if preserve_path: 

2006 target_path = location.pathInStore 

2007 if target_path.isabs(): 

2008 # This is an absolute path to an external file. 

2009 # Use the full path. 

2010 target_path = target_path.relativeToPathRoot 

2011 else: 

2012 target_path = source_uri.basename() 

2013 target_uri = destination.join(target_path) 

2014 to_transfer[source_uri] = target_uri 

2015 

2016 # In theory can now parallelize the transfer 

2017 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2018 for source_uri, target_uri in to_transfer.items(): 

2019 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2020 

2021 return list(to_transfer.values()) 

2022 
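# A minimal usage sketch (assuming `datastore`, resolved `refs`, and a
# writable scratch directory; the path is hypothetical). "copy" is one of
# the transfer modes accepted by ResourcePath.transfer_from; "move" is
# rejected above.
#
# >>> destination = ResourcePath("/tmp/export_scratch/", forceDirectory=True)
# >>> targets = datastore.retrieveArtifacts(  # doctest: +SKIP
# ...     refs, destination, transfer="copy", preserve_path=True
# ... )
# >>> len(targets)  # one entry per unique file artifact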

2023 def get( 

2024 self, 

2025 ref: DatasetRef, 

2026 parameters: Mapping[str, Any] | None = None, 

2027 storageClass: StorageClass | str | None = None, 

2028 ) -> Any: 

2029 """Load an InMemoryDataset from the store. 

2030 

2031 Parameters 

2032 ---------- 

2033 ref : `DatasetRef` 

2034 Reference to the required Dataset. 

2035 parameters : `dict` 

2036 `StorageClass`-specific parameters that specify, for example, 

2037 a slice of the dataset to be loaded. 

2038 storageClass : `StorageClass` or `str`, optional 

2039 The storage class to be used to override the Python type 

2040 returned by this method. By default the returned type matches 

2041 the dataset type definition for this dataset. Specifying a 

2042 read `StorageClass` can force a different type to be returned. 

2043 This type must be compatible with the original type. 

2044 

2045 Returns 

2046 ------- 

2047 inMemoryDataset : `object` 

2048 Requested dataset or slice thereof as an InMemoryDataset. 

2049 

2050 Raises 

2051 ------ 

2052 FileNotFoundError 

2053 Requested dataset can not be retrieved. 

2054 TypeError 

2055 Return value from formatter has unexpected type. 

2056 ValueError 

2057 Formatter failed to process the dataset. 

2058 """ 

2059 # Supplied storage class for the component being read is either 

2060 # from the ref itself or an override if we want to force

2061 # type conversion. 

2062 if storageClass is not None: 

2063 ref = ref.overrideStorageClass(storageClass) 

2064 refStorageClass = ref.datasetType.storageClass 

2065 

2066 allGetInfo = self._prepare_for_get(ref, parameters) 

2067 refComponent = ref.datasetType.component() 

2068 

2069 # Create mapping from component name to related info 

2070 allComponents = {i.component: i for i in allGetInfo} 

2071 

2072 # By definition the dataset is disassembled if we have more 

2073 # than one record for it. 

2074 isDisassembled = len(allGetInfo) > 1 

2075 

2076 # Look for the special case where we are disassembled but the 

2077 # component is a derived component that was not written during 

2078 # disassembly. For this scenario we need to check that the 

2079 # component requested is listed as a derived component for the 

2080 # composite storage class 

2081 isDisassembledReadOnlyComponent = False 

2082 if isDisassembled and refComponent: 

2083 # The composite storage class should be accessible through 

2084 # the component dataset type 

2085 compositeStorageClass = ref.datasetType.parentStorageClass 

2086 

2087 # In the unlikely scenario where the composite storage 

2088 # class is not known, we can only assume that this is a 

2089 # normal component. If that assumption is wrong then the 

2090 # branch below that reads a persisted component will fail 

2091 # so there is no need to complain here. 

2092 if compositeStorageClass is not None: 

2093 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2094 

2095 if isDisassembled and not refComponent: 

2096 # This was a disassembled dataset spread over multiple files 

2097 # and we need to put them all back together again. 

2098 # Read into memory and then assemble 

2099 

2100 # Check that the supplied parameters are suitable for the type read 

2101 refStorageClass.validateParameters(parameters) 

2102 

2103 # We want to keep track of all the parameters that were not used 

2104 # by formatters. We assume that if any of the component formatters 

2105 # use a parameter that we do not need to apply it again in the 

2106 # assembler. 

2107 usedParams = set() 

2108 

2109 components: dict[str, Any] = {} 

2110 for getInfo in allGetInfo: 

2111 # assemblerParams are parameters not understood by the 

2112 # associated formatter. 

2113 usedParams.update(set(getInfo.formatterParams)) 

2114 

2115 component = getInfo.component 

2116 

2117 if component is None: 

2118 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2119 

2120 # We do not want the formatter to think it's reading 

2121 # a component though because it is really reading a 

2122 # standalone dataset -- always tell reader it is not a 

2123 # component. 

2124 components[component] = self._read_artifact_into_memory( 

2125 getInfo, ref.makeComponentRef(component), isComponent=False 

2126 ) 

2127 

2128 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2129 

2130 # Any unused parameters will have to be passed to the assembler 

2131 if parameters: 

2132 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2133 else: 

2134 unusedParams = {} 

2135 

2136 # Process parameters 

2137 return ref.datasetType.storageClass.delegate().handleParameters( 

2138 inMemoryDataset, parameters=unusedParams 

2139 ) 

2140 

2141 elif isDisassembledReadOnlyComponent: 

2142 compositeStorageClass = ref.datasetType.parentStorageClass 

2143 if compositeStorageClass is None: 

2144 raise RuntimeError( 

2145 f"Unable to retrieve derived component '{refComponent}' since" 

2146 "no composite storage class is available." 

2147 ) 

2148 

2149 if refComponent is None: 

2150 # Mainly for mypy 

2151 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2152 

2153 # Assume that every derived component can be calculated by 

2154 # forwarding the request to a single read/write component. 

2155 # Rather than guessing which rw component is the right one by 

2156 # scanning each for a derived component of the same name, 

2157 # we ask the storage class delegate directly which one is best to 

2158 # use. 

2159 compositeDelegate = compositeStorageClass.delegate() 

2160 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2161 refComponent, set(allComponents) 

2162 ) 

2163 

2164 # Select the relevant component 

2165 rwInfo = allComponents[forwardedComponent] 

2166 

2167 # For now assume that read parameters are validated against 

2168 # the real component and not the requested component 

2169 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2170 forwardedStorageClass.validateParameters(parameters) 

2171 

2172 # The reference to use for the caching must refer to the forwarded 

2173 # component and not the derived component. 

2174 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2175 

2176 # Unfortunately the FileDescriptor inside the formatter will have 

2177 # the wrong write storage class so we need to create a new one 

2178 # given the immutability constraint. 

2179 writeStorageClass = rwInfo.info.storageClass 

2180 

2181 # We may need to put some thought into parameters for read 

2182 # components but for now forward them on as is 

2183 readFormatter = type(rwInfo.formatter)( 

2184 FileDescriptor( 

2185 rwInfo.location, 

2186 readStorageClass=refStorageClass, 

2187 storageClass=writeStorageClass, 

2188 parameters=parameters, 

2189 ), 

2190 ref.dataId, 

2191 ) 

2192 

2193 # The assembler can not receive any parameter requests for a 

2194 # derived component at this time since the assembler will 

2195 # see the storage class of the derived component and those 

2196 # parameters will have to be handled by the formatter on the 

2197 # forwarded storage class. 

2198 assemblerParams: dict[str, Any] = {} 

2199 

2200 # Need to create a new info that specifies the derived

2201 # component and associated storage class 

2202 readInfo = DatastoreFileGetInformation( 

2203 rwInfo.location, 

2204 readFormatter, 

2205 rwInfo.info, 

2206 assemblerParams, 

2207 {}, 

2208 refComponent, 

2209 refStorageClass, 

2210 ) 

2211 

2212 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2213 

2214 else: 

2215 # Single file request or component from that composite file 

2216 for lookup in (refComponent, None): 

2217 if lookup in allComponents: 

2218 getInfo = allComponents[lookup] 

2219 break 

2220 else: 

2221 raise FileNotFoundError( 

2222 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2223 ) 

2224 

2225 # Do not need the component itself if already disassembled 

2226 if isDisassembled: 

2227 isComponent = False 

2228 else: 

2229 isComponent = getInfo.component is not None 

2230 

2231 # For a component read of a composite we want the cache to 

2232 # be looking at the composite ref itself. 

2233 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2234 

2235 # For a disassembled component we can validate parameters against

2236 # the component storage class directly 

2237 if isDisassembled: 

2238 refStorageClass.validateParameters(parameters) 

2239 else: 

2240 # For an assembled composite this could be a derived 

2241 # component derived from a real component. The validity 

2242 # of the parameters is not clear. For now validate against 

2243 # the composite storage class 

2244 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2245 

2246 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2247 
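# A minimal usage sketch (assuming `datastore` and a resolved `ref`).
# Parameters are storage-class specific, so "bbox" below is purely
# illustrative, and any storageClass override must name a storage class
# compatible with the original one ("OtherStorageClass" is hypothetical).
#
# >>> obj = datastore.get(ref)  # doctest: +SKIP
# >>> subset = datastore.get(ref, parameters={"bbox": bbox})  # doctest: +SKIP
# >>> converted = datastore.get(ref, storageClass="OtherStorageClass")  # doctest: +SKIP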

2248 @transactional 

2249 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2250 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2251 

2252 Parameters 

2253 ---------- 

2254 inMemoryDataset : `object` 

2255 The dataset to store. 

2256 ref : `DatasetRef` 

2257 Reference to the associated Dataset. 

2258 

2259 Raises 

2260 ------ 

2261 TypeError 

2262 Supplied object and storage class are inconsistent. 

2263 DatasetTypeNotSupportedError 

2264 The associated `DatasetType` is not handled by this datastore. 

2265 

2266 Notes 

2267 ----- 

2268 If the datastore is configured to reject certain dataset types it 

2269 is possible that the put will fail and raise a 

2270 `DatasetTypeNotSupportedError`. The main use case for this is to 

2271 allow `ChainedDatastore` to put to multiple datastores without 

2272 requiring that every datastore accepts the dataset. 

2273 """ 

2274 

2275 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2276 # doDisassembly = True 

2277 

2278 artifacts = [] 

2279 if doDisassembly: 

2280 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2281 if components is None: 

2282 raise RuntimeError( 

2283 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2284 f"with storage class {ref.datasetType.storageClass.name} " 

2285 "is configured to be disassembled, but cannot be." 

2286 ) 

2287 for component, componentInfo in components.items(): 

2288 # Don't recurse because we want to take advantage of

2289 # bulk insert -- we need a new DatasetRef that refers to the

2290 # same dataset_id but has the component DatasetType.

2291 # DatasetType does not describe the types of its components,

2292 # so we construct one ourselves.

2293 compRef = ref.makeComponentRef(component) 

2294 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2295 artifacts.append((compRef, storedInfo)) 

2296 else: 

2297 # Write the entire thing out 

2298 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2299 artifacts.append((ref, storedInfo)) 

2300 

2301 self._register_datasets(artifacts) 

2302 
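# A minimal usage sketch (assuming `datastore`, an in-memory object
# `dataset`, and a resolved `ref` whose dataset type matches it). Whether
# the object is written as a single file or disassembled into per-component
# files is decided by the composites configuration, as in the
# shouldBeDisassembled() check above.
#
# >>> datastore.put(dataset, ref)  # doctest: +SKIP
# >>> datastore.exists(ref)  # doctest: +SKIP
# True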

2303 @transactional 

2304 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2305 # At this point can safely remove these datasets from the cache 

2306 # to avoid confusion later on. If they are not trashed later 

2307 # the cache will simply be refilled. 

2308 self.cacheManager.remove_from_cache(ref) 

2309 

2310 # If we are in trust mode there will be nothing to move to 

2311 # the trash table and we will have to try to delete the file 

2312 # immediately. 

2313 if self.trustGetRequest: 

2314 # Try to keep the logic below for a single file trash. 

2315 if isinstance(ref, DatasetRef): 

2316 refs = {ref} 

2317 else: 

2318 # Will recreate ref at the end of this branch. 

2319 refs = set(ref) 

2320 

2321 # Determine which datasets are known to datastore directly. 

2322 id_to_ref = {ref.id: ref for ref in refs} 

2323 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2324 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2325 

2326 missing = refs - existing_refs 

2327 if missing: 

2328 # Do an explicit existence check on these refs. 

2329 # We only care about the artifacts at this point and not 

2330 # the dataset existence. 

2331 artifact_existence: dict[ResourcePath, bool] = {} 

2332 _ = self.mexists(missing, artifact_existence) 

2333 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2334 

2335 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2336 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2337 for uri in uris: 

2338 try: 

2339 uri.remove() 

2340 except Exception as e: 

2341 if ignore_errors: 

2342 log.debug("Artifact %s could not be removed: %s", uri, e) 

2343 continue 

2344 raise 

2345 

2346 # There is no point asking the code below to remove refs we 

2347 # know are missing so update it with the list of existing 

2348 # records. Try to retain one vs many logic. 

2349 if not existing_refs: 

2350 # Nothing more to do since none of the datasets were 

2351 # known to the datastore record table. 

2352 return 

2353 ref = list(existing_refs) 

2354 if len(ref) == 1: 

2355 ref = ref[0] 

2356 

2357 # Get file metadata and internal metadata 

2358 if not isinstance(ref, DatasetRef): 

2359 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2360 # Assumed to be an iterable of refs so bulk mode enabled. 

2361 try: 

2362 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2363 except Exception as e: 

2364 if ignore_errors: 

2365 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2366 else: 

2367 raise 

2368 return 

2369 

2370 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2371 

2372 fileLocations = self._get_dataset_locations_info(ref) 

2373 

2374 if not fileLocations: 

2375 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2376 if ignore_errors: 

2377 log.warning(err_msg) 

2378 return 

2379 else: 

2380 raise FileNotFoundError(err_msg) 

2381 

2382 for location, storedFileInfo in fileLocations: 

2383 if not self._artifact_exists(location): 

2384 err_msg = ( 

2385 f"Dataset is known to datastore {self.name} but " 

2386 f"associated artifact ({location.uri}) is missing" 

2387 ) 

2388 if ignore_errors: 

2389 log.warning(err_msg) 

2390 return 

2391 else: 

2392 raise FileNotFoundError(err_msg) 

2393 

2394 # Mark dataset as trashed 

2395 try: 

2396 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2397 except Exception as e: 

2398 if ignore_errors: 

2399 log.warning( 

2400 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2401 "but encountered an error: %s", 

2402 ref, 

2403 self.name, 

2404 e, 

2405 ) 

2406 pass 

2407 else: 

2408 raise 

2409 
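# A minimal usage sketch (assuming `datastore` and resolved `refs`).
# trash() only marks datasets for removal (or, in trust mode, deletes
# artifacts unknown to the records table immediately); artifacts recorded
# in the records table are removed when emptyTrash() is called.
#
# >>> datastore.trash(refs)  # doctest: +SKIP
# >>> datastore.emptyTrash()  # doctest: +SKIP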

2410 @transactional 

2411 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2412 """Remove all datasets from the trash. 

2413 

2414 Parameters 

2415 ---------- 

2416 ignore_errors : `bool` 

2417 If `True` return without error even if something went wrong. 

2418 Problems could occur if another process is simultaneously trying 

2419 to delete. 

2420 """ 

2421 log.debug("Emptying trash in datastore %s", self.name) 

2422 

2423 # Context manager will empty trash iff we finish it without raising. 

2424 # It will also automatically delete the relevant rows from the 

2425 # trash table and the records table. 

2426 with self.bridge.emptyTrash( 

2427 self._table, record_class=StoredFileInfo, record_column="path" 

2428 ) as trash_data: 

2429 # Removing the artifacts themselves requires that the files are 

2430 # not also associated with refs that are not to be trashed. 

2431 # Therefore need to do a query with the file paths themselves 

2432 # and return all the refs associated with them. Can only delete 

2433 # a file if the refs to be trashed are the only refs associated 

2434 # with the file. 

2435 # This requires multiple copies of the trashed items 

2436 trashed, artifacts_to_keep = trash_data 

2437 

2438 if artifacts_to_keep is None: 

2439 # The bridge is not helping us so have to work it out 

2440 # ourselves. This is not going to be as efficient. 

2441 trashed = list(trashed) 

2442 

2443 # The instance check is for mypy since up to this point it 

2444 # does not know the type of info. 

2445 path_map = self._refs_associated_with_artifacts( 

2446 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2447 ) 

2448 

2449 for ref, info in trashed: 

2450 # Mypy needs to know this is not the base class 

2451 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2452 

2453 path_map[info.path].remove(ref.id) 

2454 if not path_map[info.path]: 

2455 del path_map[info.path] 

2456 

2457 artifacts_to_keep = set(path_map) 

2458 

2459 for ref, info in trashed: 

2460 # Should not happen for this implementation but need 

2461 # to keep mypy happy. 

2462 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2463 

2464 # Mypy needs to know this is not the base class 

2465 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2466 

2467 if info.path in artifacts_to_keep: 

2468 # This is a multi-dataset artifact and we are not 

2469 # removing all associated refs. 

2470 continue 

2471 

2472 # Only trashed refs still known to datastore will be returned. 

2473 location = info.file_location(self.locationFactory) 

2474 

2475 # Point of no return for this artifact 

2476 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2477 try: 

2478 self._delete_artifact(location) 

2479 except FileNotFoundError: 

2480 # If the file itself has been deleted there is nothing 

2481 # we can do about it. It is possible that trash has 

2482 # been run in parallel in another process or someone 

2483 # decided to delete the file. It is unlikely to come 

2484 # back and so we should still continue with the removal 

2485 # of the entry from the trash table. It is also possible 

2486 # we removed it in a previous iteration if it was 

2487 # a multi-dataset artifact. The delete artifact method 

2488 # will log a debug message in this scenario. 

2489 # Distinguishing a file that was missing before trash started

2490 # from a file already removed earlier in this trash operation

2491 # is not worth the potential memory cost of tracking that

2492 # difference.

2493 pass 

2494 except Exception as e: 

2495 if ignore_errors: 

2496 # Use a debug message here even though it's not 

2497 # a good situation. In some cases this can be 

2498 # caused by a race between user A and user B 

2499 # and neither of them has permissions for the 

2500 # other's files. Butler does not know about users 

2501 # and trash has no idea what collections these 

2502 # files were in (without guessing from a path). 

2503 log.debug( 

2504 "Encountered error removing artifact %s from datastore %s: %s", 

2505 location.uri, 

2506 self.name, 

2507 e, 

2508 ) 

2509 else: 

2510 raise 

2511 

2512 @transactional 

2513 def transfer_from( 

2514 self, 

2515 source_datastore: Datastore, 

2516 refs: Iterable[DatasetRef], 

2517 transfer: str = "auto", 

2518 artifact_existence: dict[ResourcePath, bool] | None = None, 

2519 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2520 # Docstring inherited 

2521 if type(self) is not type(source_datastore): 

2522 raise TypeError( 

2523 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2524 f"source datastore ({type(source_datastore)})." 

2525 ) 

2526 

2527 # Be explicit for mypy 

2528 if not isinstance(source_datastore, FileDatastore): 

2529 raise TypeError( 

2530 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2531 f" {type(source_datastore)}" 

2532 ) 

2533 

2534 # Stop early if "direct" transfer mode is requested. That would 

2535 # require that the URI inside the source datastore should be stored 

2536 # directly in the target datastore, which seems unlikely to be useful 

2537 # since at any moment the source datastore could delete the file. 

2538 if transfer in ("direct", "split"): 

2539 raise ValueError( 

2540 f"Can not transfer from a source datastore using {transfer} mode since" 

2541 " those files are controlled by the other datastore." 

2542 ) 

2543 

2544 # Empty existence lookup if none given. 

2545 if artifact_existence is None: 

2546 artifact_existence = {} 

2547 

2548 # We will go through the list multiple times so must convert 

2549 # generators to lists. 

2550 refs = list(refs) 

2551 

2552 # In order to handle disassembled composites the code works 

2553 # at the records level since it can assume that internal APIs 

2554 # can be used. 

2555 # - If the record already exists in the destination this is assumed 

2556 # to be okay. 

2557 # - If there is no record but the source and destination URIs are 

2558 # identical no transfer is done but the record is added. 

2559 # - If the source record refers to an absolute URI currently assume 

2560 # that that URI should remain absolute and will be visible to the 

2561 # destination butler. May need to have a flag to indicate whether 

2562 # the dataset should be transferred. This will only happen if 

2563 # the detached Butler has had a local ingest. 

2564 

2565 # What we really want is all the records in the source datastore 

2566 # associated with these refs. Or derived ones if they don't exist 

2567 # in the source. 

2568 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2569 

2570 # The source dataset_ids are the keys in these records 

2571 source_ids = set(source_records) 

2572 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2573 

2574 requested_ids = {ref.id for ref in refs} 

2575 missing_ids = requested_ids - source_ids 

2576 

2577 # Missing IDs can be okay if that datastore has allowed 

2578 # gets based on file existence. Should we transfer what we can 

2579 # or complain about it and warn? 

2580 if missing_ids and not source_datastore.trustGetRequest: 

2581 raise ValueError( 

2582 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2583 ) 

2584 

2585 # Need to map these missing IDs to a DatasetRef so we can guess 

2586 # the details. 

2587 if missing_ids: 

2588 log.info( 

2589 "Number of expected datasets missing from source datastore records: %d out of %d", 

2590 len(missing_ids), 

2591 len(requested_ids), 

2592 ) 

2593 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2594 

2595 # This should be chunked in case we end up having to check 

2596 # the file store since we need some log output to show 

2597 # progress. 

2598 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2599 records = {} 

2600 for missing in missing_ids_chunk: 

2601 # Ask the source datastore where the missing artifacts 

2602 # should be. An execution butler might not know about the 

2603 # artifacts even if they are there. 

2604 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2605 records[missing] = [info for _, info in expected] 

2606 

2607 # Call the mexists helper method in case we have not already

2608 # checked these artifacts such that artifact_existence is 

2609 # empty. This allows us to benefit from parallelism. 

2610 # datastore.mexists() itself does not give us access to the 

2611 # derived datastore record. 

2612 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2613 ref_exists = source_datastore._process_mexists_records( 

2614 id_to_ref, records, False, artifact_existence=artifact_existence 

2615 ) 

2616 

2617 # Now go through the records and propagate the ones that exist. 

2618 location_factory = source_datastore.locationFactory 

2619 for missing, record_list in records.items(): 

2620 # Skip completely if the ref does not exist. 

2621 ref = id_to_ref[missing] 

2622 if not ref_exists[ref]: 

2623 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2624 continue 

2625 # Check for file artifact to decide which parts of a 

2626 # disassembled composite do exist. If there is only a 

2627 # single record we don't even need to look because it can't 

2628 # be a composite and must exist. 

2629 if len(record_list) == 1: 

2630 dataset_records = record_list 

2631 else: 

2632 dataset_records = [ 

2633 record 

2634 for record in record_list 

2635 if artifact_existence[record.file_location(location_factory).uri] 

2636 ] 

2637 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2638 

2639 # Rely on source_records being a defaultdict. 

2640 source_records[missing].extend(dataset_records) 

2641 

2642 # See if we already have these records 

2643 target_records = self._get_stored_records_associated_with_refs(refs) 

2644 

2645 # The artifacts to register 

2646 artifacts = [] 

2647 

2648 # Refs that already exist 

2649 already_present = [] 

2650 

2651 # Refs that were rejected by this datastore. 

2652 rejected = set() 

2653 

2654 # Refs that were transferred successfully. 

2655 accepted = set() 

2656 

2657 # Record each time we have done a "direct" transfer. 

2658 direct_transfers = [] 

2659 

2660 # Now can transfer the artifacts 

2661 for ref in refs: 

2662 if not self.constraints.isAcceptable(ref): 

2663 # This datastore should not be accepting this dataset. 

2664 rejected.add(ref) 

2665 continue 

2666 

2667 accepted.add(ref) 

2668 

2669 if ref.id in target_records: 

2670 # Already have an artifact for this. 

2671 already_present.append(ref) 

2672 continue 

2673 

2674 # mypy needs to know these are always resolved refs 

2675 for info in source_records[ref.id]: 

2676 source_location = info.file_location(source_datastore.locationFactory) 

2677 target_location = info.file_location(self.locationFactory) 

2678 if source_location == target_location and not source_location.pathInStore.isabs(): 

2679 # Artifact is already in the target location. 

2680 # (which is how execution butler currently runs) 

2681 pass 

2682 else: 

2683 if target_location.pathInStore.isabs(): 

2684 # Just because we can see the artifact when running 

2685 # the transfer doesn't mean it will be generally 

2686 # accessible to a user of this butler. Need to decide 

2687 # what to do about an absolute path. 

2688 if transfer == "auto": 

2689 # For "auto" transfers we allow the absolute URI 

2690 # to be recorded in the target datastore. 

2691 direct_transfers.append(source_location) 

2692 else: 

2693 # The user is explicitly requesting a transfer 

2694 # even for an absolute URI. This requires us to 

2695 # calculate the target path. 

2696 template_ref = ref 

2697 if info.component: 

2698 template_ref = ref.makeComponentRef(info.component) 

2699 target_location = self._calculate_ingested_datastore_name( 

2700 source_location.uri, 

2701 template_ref, 

2702 ) 

2703 

2704 info = info.update(path=target_location.pathInStore.path) 

2705 

2706 # Need to transfer it to the new location. 

2707 # Assume we should always overwrite. If the artifact 

2708 # is there this might indicate that a previous transfer 

2709 # was interrupted but was not able to be rolled back 

2710 # completely (eg pre-emption) so follow Datastore default 

2711 # and overwrite. 

2712 target_location.uri.transfer_from( 

2713 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2714 ) 

2715 

2716 artifacts.append((ref, info)) 

2717 

2718 if direct_transfers: 

2719 log.info( 

2720 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2721 len(direct_transfers), 

2722 "" if len(direct_transfers) == 1 else "s", 

2723 ) 

2724 

2725 self._register_datasets(artifacts) 

2726 

2727 if already_present: 

2728 n_skipped = len(already_present) 

2729 log.info( 

2730 "Skipped transfer of %d dataset%s already present in datastore", 

2731 n_skipped, 

2732 "" if n_skipped == 1 else "s", 

2733 ) 

2734 

2735 return accepted, rejected 

2736 
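# A minimal usage sketch (assuming two FileDatastore instances that share
# dataset refs, e.g. an execution butler feeding back into a main
# repository). A shared artifact_existence dict avoids re-checking the same
# URIs if mexists() was already called on the source datastore.
#
# >>> artifact_existence: dict[ResourcePath, bool] = {}
# >>> accepted, rejected = target_datastore.transfer_from(  # doctest: +SKIP
# ...     source_datastore, refs, transfer="copy", artifact_existence=artifact_existence
# ... )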

2737 @transactional 

2738 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2739 # Docstring inherited. 

2740 refs = list(refs) 

2741 self.bridge.forget(refs) 

2742 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2743 

2744 def validateConfiguration( 

2745 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2746 ) -> None: 

2747 """Validate some of the configuration for this datastore. 

2748 

2749 Parameters 

2750 ---------- 

2751 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2752 Entities to test against this configuration. Can be differing 

2753 types. 

2754 logFailures : `bool`, optional 

2755 If `True`, output a log message for every validation error 

2756 detected. 

2757 

2758 Raises 

2759 ------ 

2760 DatastoreValidationError 

2761 Raised if there is a validation problem with a configuration. 

2762 All the problems are reported in a single exception. 

2763 

2764 Notes 

2765 ----- 

2766 This method checks that all the supplied entities have valid file 

2767 templates and also have formatters defined. 

2768 """ 

2769 

2770 templateFailed = None 

2771 try: 

2772 self.templates.validateTemplates(entities, logFailures=logFailures) 

2773 except FileTemplateValidationError as e: 

2774 templateFailed = str(e) 

2775 

2776 formatterFailed = [] 

2777 for entity in entities: 

2778 try: 

2779 self.formatterFactory.getFormatterClass(entity) 

2780 except KeyError as e: 

2781 formatterFailed.append(str(e)) 

2782 if logFailures: 

2783 log.critical("Formatter failure: %s", e) 

2784 

2785 if templateFailed or formatterFailed: 

2786 messages = [] 

2787 if templateFailed: 

2788 messages.append(templateFailed) 

2789 if formatterFailed: 

2790 messages.append(",".join(formatterFailed)) 

2791 msg = ";\n".join(messages) 

2792 raise DatastoreValidationError(msg) 

2793 

2794 def getLookupKeys(self) -> set[LookupKey]: 

2795 # Docstring is inherited from base class 

2796 return ( 

2797 self.templates.getLookupKeys() 

2798 | self.formatterFactory.getLookupKeys() 

2799 | self.constraints.getLookupKeys() 

2800 ) 

2801 

2802 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2803 # Docstring is inherited from base class 

2804 # The key can be valid in either formatters or templates so we can 

2805 # only check the template if it exists 

2806 if lookupKey in self.templates: 

2807 try: 

2808 self.templates[lookupKey].validateTemplate(entity) 

2809 except FileTemplateValidationError as e: 

2810 raise DatastoreValidationError(e) from e 

2811 

2812 def export( 

2813 self, 

2814 refs: Iterable[DatasetRef], 

2815 *, 

2816 directory: ResourcePathExpression | None = None, 

2817 transfer: str | None = "auto", 

2818 ) -> Iterable[FileDataset]: 

2819 # Docstring inherited from Datastore.export. 

2820 if transfer == "auto" and directory is None: 

2821 transfer = None 

2822 

2823 if transfer is not None and directory is None: 

2824 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2825 

2826 if transfer == "move": 

2827 raise TypeError("Can not export by moving files out of datastore.") 

2828 elif transfer == "direct": 

2829 # For an export, treat this as equivalent to None. We do not 

2830 # want an import to risk using absolute URIs to datasets owned 

2831 # by another datastore. 

2832 log.info("Treating 'direct' transfer mode as in-place export.") 

2833 transfer = None 

2834 

2835 # Force the directory to be a URI object 

2836 directoryUri: ResourcePath | None = None 

2837 if directory is not None: 

2838 directoryUri = ResourcePath(directory, forceDirectory=True) 

2839 

2840 if transfer is not None and directoryUri is not None: 

2841 # mypy needs the second test 

2842 if not directoryUri.exists(): 

2843 raise FileNotFoundError(f"Export location {directory} does not exist") 

2844 

2845 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2846 for ref in progress.wrap(refs, "Exporting dataset files"): 

2847 fileLocations = self._get_dataset_locations_info(ref) 

2848 if not fileLocations: 

2849 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2850 # For now we can not export disassembled datasets 

2851 if len(fileLocations) > 1: 

2852 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2853 location, storedFileInfo = fileLocations[0] 

2854 

2855 pathInStore = location.pathInStore.path 

2856 if transfer is None: 

2857 # TODO: do we also need to return the readStorageClass somehow? 

2858 # We will use the path in store directly. If this is an 

2859 # absolute URI, preserve it. 

2860 if location.pathInStore.isabs(): 

2861 pathInStore = str(location.uri) 

2862 elif transfer == "direct": 

2863 # Use full URIs to the remote store in the export 

2864 pathInStore = str(location.uri) 

2865 else: 

2866 # mypy needs help 

2867 assert directoryUri is not None, "directoryUri must be defined to get here" 

2868 storeUri = ResourcePath(location.uri) 

2869 

2870 # if the datastore has an absolute URI to a resource, we 

2871 # have two options: 

2872 # 1. Keep the absolute URI in the exported YAML 

2873 # 2. Allocate a new name in the local datastore and transfer 

2874 # it. 

2875 # For now go with option 2 

2876 if location.pathInStore.isabs(): 

2877 template = self.templates.getTemplate(ref) 

2878 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2879 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2880 

2881 exportUri = directoryUri.join(pathInStore) 

2882 exportUri.transfer_from(storeUri, transfer=transfer) 

2883 

2884 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2885 
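# A minimal usage sketch (assuming `datastore`, resolved `refs`, and an
# existing export directory; the path is hypothetical). export() is a
# generator, so the yielded FileDataset entries are typically consumed by
# higher-level export machinery that also writes the accompanying metadata.
#
# >>> exported = list(  # doctest: +SKIP
# ...     datastore.export(refs, directory="/tmp/export", transfer="copy")
# ... )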

2886 @staticmethod 

2887 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2888 """Compute the checksum of the supplied file. 

2889 

2890 Parameters 

2891 ---------- 

2892 uri : `lsst.resources.ResourcePath` 

2893 Name of resource to calculate checksum from. 

2894 algorithm : `str`, optional 

2895 Name of algorithm to use. Must be one of the algorithms supported 

2896 by :py:mod:`hashlib`.

2897 block_size : `int` 

2898 Number of bytes to read from file at one time. 

2899 

2900 Returns 

2901 ------- 

2902 hexdigest : `str` 

2903 Hex digest of the file. 

2904 

2905 Notes 

2906 ----- 

2907 Currently returns None if the URI is for a remote resource. 

2908 """ 

2909 if algorithm not in hashlib.algorithms_guaranteed: 

2910 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2911 

2912 if not uri.isLocal: 

2913 return None 

2914 

2915 hasher = hashlib.new(algorithm) 

2916 

2917 with uri.as_local() as local_uri: 

2918 with open(local_uri.ospath, "rb") as f: 

2919 for chunk in iter(lambda: f.read(block_size), b""): 

2920 hasher.update(chunk) 

2921 

2922 return hasher.hexdigest() 

2923 
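# A minimal usage sketch for the static checksum helper (the file path is
# hypothetical). The default algorithm is blake2b; remote URIs return
# `None`, as noted above.
#
# >>> from lsst.resources import ResourcePath
# >>> digest = FileDatastore.computeChecksum(ResourcePath("/tmp/example.fits"))  # doctest: +SKIP
# >>> digest is None or isinstance(digest, str)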

2924 def needs_expanded_data_ids( 

2925 self, 

2926 transfer: str | None, 

2927 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2928 ) -> bool: 

2929 # Docstring inherited. 

2930 # This _could_ also use entity to inspect whether the filename template 

2931 # involves placeholders other than the required dimensions for its 

2932 # dataset type, but that's not necessary for correctness; it just 

2933 # enables more optimizations (perhaps only in theory). 

2934 return transfer not in ("direct", None) 

2935 

2936 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2937 # Docstring inherited from the base class. 

2938 record_data = data.get(self.name) 

2939 if not record_data: 

2940 return 

2941 

2942 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2943 

2944 # TODO: Verify that there are no unexpected table names in the dict? 

2945 unpacked_records = [] 

2946 for dataset_data in record_data.records.values(): 

2947 records = dataset_data.get(self._table.name) 

2948 if records: 

2949 for info in records: 

2950 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2951 unpacked_records.append(info.to_record()) 

2952 if unpacked_records: 

2953 self._table.insert(*unpacked_records, transaction=self._transaction) 

2954 

2955 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2956 # Docstring inherited from the base class. 

2957 exported_refs = list(self._bridge.check(refs)) 

2958 ids = {ref.id for ref in exported_refs} 

2959 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2960 for row in self._table.fetch(dataset_id=ids): 

2961 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2962 dataset_records = records.setdefault(info.dataset_id, {}) 

2963 dataset_records.setdefault(self._table.name, []).append(info) 

2964 

2965 record_data = DatastoreRecordData(records=records) 

2966 return {self.name: record_data} 

2967 

2968 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2969 # Docstring inherited from the base class. 

2970 self._retrieve_dataset_method = method 

2971 

2972 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2973 """Update dataset reference to use the storage class from registry.""" 

2974 if self._retrieve_dataset_method is None: 

2975 # We could raise an exception here but unit tests do not define 

2976 # this method. 

2977 return ref 

2978 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2979 if dataset_type is not None: 

2980 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2981 return ref