Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 10%

972 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-08-05 01:26 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Generic file-based datastore code.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("FileDatastore",) 

27 

28import contextlib 

29import hashlib 

30import logging 

31from collections import defaultdict 

32from collections.abc import Callable, Iterable, Mapping, Sequence 

33from dataclasses import dataclass 

34from typing import TYPE_CHECKING, Any, ClassVar 

35 

36from lsst.daf.butler import ( 

37 CompositesMap, 

38 Config, 

39 DatasetId, 

40 DatasetRef, 

41 DatasetRefURIs, 

42 DatasetType, 

43 DatasetTypeNotSupportedError, 

44 Datastore, 

45 DatastoreCacheManager, 

46 DatastoreConfig, 

47 DatastoreDisabledCacheManager, 

48 DatastoreRecordData, 

49 DatastoreValidationError, 

50 FileDataset, 

51 FileDescriptor, 

52 FileTemplates, 

53 FileTemplateValidationError, 

54 Formatter, 

55 FormatterFactory, 

56 Location, 

57 LocationFactory, 

58 Progress, 

59 StorageClass, 

60 StoredDatastoreItemInfo, 

61 StoredFileInfo, 

62 ddl, 

63) 

64from lsst.daf.butler.core.repoRelocation import replaceRoot 

65from lsst.daf.butler.core.utils import transactional 

66from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

67from lsst.resources import ResourcePath, ResourcePathExpression 

68from lsst.utils.introspection import get_class_of, get_instance_of 

69from lsst.utils.iteration import chunk_iterable 

70 

71# For VERBOSE logging usage. 

72from lsst.utils.logging import VERBOSE, getLogger 

73from lsst.utils.timer import time_this 

74from sqlalchemy import BigInteger, String 

75 

76from ..registry.interfaces import FakeDatasetRef 

77from .genericDatastore import GenericBaseDatastore 

78 

79if TYPE_CHECKING: 

80 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

81 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

82 

83log = getLogger(__name__) 

84 

85 

86class _IngestPrepData(Datastore.IngestPrepData): 

87 """Helper class for FileDatastore ingest implementation. 

88 

89 Parameters 

90 ---------- 

91 datasets : `~collections.abc.Iterable` of `FileDataset` 

92 Files to be ingested by this datastore. 

93 """ 

94 

95 def __init__(self, datasets: Iterable[FileDataset]): 

96 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

97 self.datasets = datasets 

98 

99 

100@dataclass(frozen=True) 

101class DatastoreFileGetInformation: 

102 """Collection of useful parameters needed to retrieve a file from 

103 a Datastore. 

104 """ 

105 

106 location: Location 

107 """The location from which to read the dataset.""" 

108 

109 formatter: Formatter 

110 """The `Formatter` to use to deserialize the dataset.""" 

111 

112 info: StoredFileInfo 

113 """Stored information about this file and its formatter.""" 

114 

115 assemblerParams: Mapping[str, Any] 

116 """Parameters to use for post-processing the retrieved dataset.""" 

117 

118 formatterParams: Mapping[str, Any] 

119 """Parameters that were understood by the associated formatter.""" 

120 

121 component: str | None 

122 """The component to be retrieved (can be `None`).""" 

123 

124 readStorageClass: StorageClass 

125 """The `StorageClass` of the dataset being read.""" 

126 

127 

128class FileDatastore(GenericBaseDatastore): 

129 """Generic Datastore for file-based implementations. 

130 

131 Should always be sub-classed since key abstract methods are missing. 

132 

133 Parameters 

134 ---------- 

135 config : `DatastoreConfig` or `str` 

136 Configuration as either a `Config` object or URI to file. 

137 bridgeManager : `DatastoreRegistryBridgeManager` 

138 Object that manages the interface between `Registry` and datastores. 

139 butlerRoot : `str`, optional 

140 New datastore root to use to override the configuration value. 

141 

142 Raises 

143 ------ 

144 ValueError 

145 If root location does not exist and ``create`` is `False` in the 

146 configuration. 

147 """ 

148 

149 defaultConfigFile: ClassVar[str | None] = None 

150 """Path to configuration defaults. Accessed within the ``config`` resource 

151 or relative to a search path. Can be None if no defaults specified. 

152 """ 

153 

154 root: ResourcePath 

155 """Root directory URI of this `Datastore`.""" 

156 

157 locationFactory: LocationFactory 

158 """Factory for creating locations relative to the datastore root.""" 

159 

160 formatterFactory: FormatterFactory 

161 """Factory for creating instances of formatters.""" 

162 

163 templates: FileTemplates 

164 """File templates that can be used by this `Datastore`.""" 

165 

166 composites: CompositesMap 

167 """Determines whether a dataset should be disassembled on put.""" 

168 

169 defaultConfigFile = "datastores/fileDatastore.yaml" 

170 """Path to configuration defaults. Accessed within the ``config`` resource 

171 or relative to a search path. Can be None if no defaults specified. 

172 """ 

173 

174 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

175 """Callable that is used in trusted mode to retrieve registry definition 

176 of a named dataset type. 

177 """ 

178 

179 @classmethod 

180 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

181 """Set any filesystem-dependent config options for this Datastore to 

182 be appropriate for a new empty repository with the given root. 

183 

184 Parameters 

185 ---------- 

186 root : `str` 

187 URI to the root of the data repository. 

188 config : `Config` 

189 A `Config` to update. Only the subset understood by 

190 this component will be updated. Will not expand 

191 defaults. 

192 full : `Config` 

193 A complete config with all defaults expanded that can be 

194 converted to a `DatastoreConfig`. Read-only and will not be 

195 modified by this method. 

196 Repository-specific options that should not be obtained 

197 from defaults when Butler instances are constructed 

198 should be copied from ``full`` to ``config``. 

199 overwrite : `bool`, optional 

200 If `False`, do not modify a value in ``config`` if the value 

201 already exists. Default is always to overwrite with the provided 

202 ``root``. 

203 

204 Notes 

205 ----- 

206 If a keyword is explicitly defined in the supplied ``config`` it 

207 will not be overridden by this method if ``overwrite`` is `False`. 

208 This allows explicit values set in external configs to be retained. 

209 """ 

210 Config.updateParameters( 

211 DatastoreConfig, 

212 config, 

213 full, 

214 toUpdate={"root": root}, 

215 toCopy=("cls", ("records", "table")), 

216 overwrite=overwrite, 

217 ) 

218 

219 @classmethod 

220 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

221 return ddl.TableSpec( 

222 fields=[ 

223 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

224 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

225 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

226 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

227 # Use empty string to indicate no component 

228 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

229 # TODO: should checksum be Base64Bytes instead? 

230 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

231 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

232 ], 

233 unique=frozenset(), 

234 indexes=[ddl.IndexSpec("path")], 

235 ) 

236 

237 def __init__( 

238 self, 

239 config: DatastoreConfig | ResourcePathExpression, 

240 bridgeManager: DatastoreRegistryBridgeManager, 

241 butlerRoot: str | None = None, 

242 ): 

243 super().__init__(config, bridgeManager) 

244 if "root" not in self.config: 

245 raise ValueError("No root directory specified in configuration") 

246 

247 self._bridgeManager = bridgeManager 

248 

249 # Name ourselves either using an explicit name or a name 

250 # derived from the (unexpanded) root 

251 if "name" in self.config: 

252 self.name = self.config["name"] 

253 else: 

254 # We use the unexpanded root in the name to indicate that this 

255 # datastore can be moved without having to update registry. 

256 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

257 

258 # Support repository relocation in config 

259 # Existence of self.root is checked in subclass 

260 self.root = ResourcePath( 

261 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

262 ) 

263 

264 self.locationFactory = LocationFactory(self.root) 

265 self.formatterFactory = FormatterFactory() 

266 

267 # Now associate formatters with storage classes 

268 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

269 

270 # Read the file naming templates 

271 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

272 

273 # See if composites should be disassembled 

274 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

275 

276 tableName = self.config["records", "table"] 

277 try: 

278 # Storage of paths and formatters, keyed by dataset_id 

279 self._table = bridgeManager.opaque.register( 

280 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

281 ) 

282 # Interface to Registry. 

283 self._bridge = bridgeManager.register(self.name) 

284 except ReadOnlyDatabaseError: 

285 # If the database is read only and we just tried and failed to 

286 # create a table, it means someone is trying to create a read-only 

287 # butler client for an empty repo. That should be okay, as long 

288 # as they then try to get any datasets before some other client 

289 # creates the table. Chances are they'rejust validating 

290 # configuration. 

291 pass 

292 

293 # Determine whether checksums should be used - default to False 

294 self.useChecksum = self.config.get("checksum", False) 

295 

296 # Determine whether we can fall back to configuration if a 

297 # requested dataset is not known to registry 

298 self.trustGetRequest = self.config.get("trust_get_request", False) 

299 

300 # Create a cache manager 

301 self.cacheManager: AbstractDatastoreCacheManager 

302 if "cached" in self.config: 

303 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

304 else: 

305 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

306 

307 # Check existence and create directory structure if necessary 

308 if not self.root.exists(): 

309 if "create" not in self.config or not self.config["create"]: 

310 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

311 try: 

312 self.root.mkdir() 

313 except Exception as e: 

314 raise ValueError( 

315 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

316 ) from e 

317 

318 def __str__(self) -> str: 

319 return str(self.root) 

320 

321 @property 

322 def bridge(self) -> DatastoreRegistryBridge: 

323 return self._bridge 

324 

325 @property 

326 def roots(self) -> dict[str, ResourcePath | None]: 

327 # Docstring inherited. 

328 return {self.name: self.root} 

329 

330 def _artifact_exists(self, location: Location) -> bool: 

331 """Check that an artifact exists in this datastore at the specified 

332 location. 

333 

334 Parameters 

335 ---------- 

336 location : `Location` 

337 Expected location of the artifact associated with this datastore. 

338 

339 Returns 

340 ------- 

341 exists : `bool` 

342 True if the location can be found, false otherwise. 

343 """ 

344 log.debug("Checking if resource exists: %s", location.uri) 

345 return location.uri.exists() 

346 

347 def _delete_artifact(self, location: Location) -> None: 

348 """Delete the artifact from the datastore. 

349 

350 Parameters 

351 ---------- 

352 location : `Location` 

353 Location of the artifact associated with this datastore. 

354 """ 

355 if location.pathInStore.isabs(): 

356 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

357 

358 try: 

359 location.uri.remove() 

360 except FileNotFoundError: 

361 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

362 raise 

363 except Exception as e: 

364 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

365 raise 

366 log.debug("Successfully deleted file: %s", location.uri) 

367 

368 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

369 # Docstring inherited from GenericBaseDatastore 

370 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos, strict=True)] 

371 self._table.insert(*records, transaction=self._transaction) 

372 

373 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

374 # Docstring inherited from GenericBaseDatastore 

375 

376 # Look for the dataset_id -- there might be multiple matches 

377 # if we have disassembled the dataset. 

378 records = self._table.fetch(dataset_id=ref.id) 

379 return [StoredFileInfo.from_record(record) for record in records] 

380 

381 def _get_stored_records_associated_with_refs( 

382 self, refs: Iterable[DatasetIdRef] 

383 ) -> dict[DatasetId, list[StoredFileInfo]]: 

384 """Retrieve all records associated with the provided refs. 

385 

386 Parameters 

387 ---------- 

388 refs : iterable of `DatasetIdRef` 

389 The refs for which records are to be retrieved. 

390 

391 Returns 

392 ------- 

393 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

394 The matching records indexed by the ref ID. The number of entries 

395 in the dict can be smaller than the number of requested refs. 

396 """ 

397 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

398 

399 # Uniqueness is dataset_id + component so can have multiple records 

400 # per ref. 

401 records_by_ref = defaultdict(list) 

402 for record in records: 

403 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

404 return records_by_ref 

405 

406 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

407 """Return paths and associated dataset refs. 

408 

409 Parameters 

410 ---------- 

411 paths : `list` of `str` or `lsst.resources.ResourcePath` 

412 All the paths to include in search. 

413 

414 Returns 

415 ------- 

416 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

417 Mapping of each path to a set of associated database IDs. 

418 """ 

419 records = self._table.fetch(path=[str(path) for path in paths]) 

420 result = defaultdict(set) 

421 for row in records: 

422 result[row["path"]].add(row["dataset_id"]) 

423 return result 

424 

425 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

426 """Return all dataset refs associated with the supplied path. 

427 

428 Parameters 

429 ---------- 

430 pathInStore : `lsst.resources.ResourcePath` 

431 Path of interest in the data store. 

432 

433 Returns 

434 ------- 

435 ids : `set` of `int` 

436 All `DatasetRef` IDs associated with this path. 

437 """ 

438 records = list(self._table.fetch(path=str(pathInStore))) 

439 ids = {r["dataset_id"] for r in records} 

440 return ids 

441 

442 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

443 # Docstring inherited from GenericBaseDatastore 

444 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

445 

446 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

447 r"""Find all the `Location`\ s of the requested dataset in the 

448 `Datastore` and the associated stored file information. 

449 

450 Parameters 

451 ---------- 

452 ref : `DatasetRef` 

453 Reference to the required `Dataset`. 

454 

455 Returns 

456 ------- 

457 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

458 Location of the dataset within the datastore and 

459 stored information about each file and its formatter. 

460 """ 

461 # Get the file information (this will fail if no file) 

462 records = self.getStoredItemsInfo(ref) 

463 

464 # Use the path to determine the location -- we need to take 

465 # into account absolute URIs in the datastore record 

466 return [(r.file_location(self.locationFactory), r) for r in records] 

467 

468 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

469 """Check that there is only one dataset associated with the 

470 specified artifact. 

471 

472 Parameters 

473 ---------- 

474 ref : `DatasetRef` or `FakeDatasetRef` 

475 Dataset to be removed. 

476 location : `Location` 

477 The location of the artifact to be removed. 

478 

479 Returns 

480 ------- 

481 can_remove : `Bool` 

482 True if the artifact can be safely removed. 

483 """ 

484 # Can't ever delete absolute URIs. 

485 if location.pathInStore.isabs(): 

486 return False 

487 

488 # Get all entries associated with this path 

489 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

490 if not allRefs: 

491 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

492 

493 # Remove these refs from all the refs and if there is nothing left 

494 # then we can delete 

495 remainingRefs = allRefs - {ref.id} 

496 

497 if remainingRefs: 

498 return False 

499 return True 

500 

501 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

502 """Predict the location and related file information of the requested 

503 dataset in this datastore. 

504 

505 Parameters 

506 ---------- 

507 ref : `DatasetRef` 

508 Reference to the required `Dataset`. 

509 

510 Returns 

511 ------- 

512 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

513 Expected Location of the dataset within the datastore and 

514 placeholder information about each file and its formatter. 

515 

516 Notes 

517 ----- 

518 Uses the current configuration to determine how we would expect the 

519 datastore files to have been written if we couldn't ask registry. 

520 This is safe so long as there has been no change to datastore 

521 configuration between writing the dataset and wanting to read it. 

522 Will not work for files that have been ingested without using the 

523 standard file template or default formatter. 

524 """ 

525 # If we have a component ref we always need to ask the questions 

526 # of the composite. If the composite is disassembled this routine 

527 # should return all components. If the composite was not 

528 # disassembled the composite is what is stored regardless of 

529 # component request. Note that if the caller has disassembled 

530 # a composite there is no way for this guess to know that 

531 # without trying both the composite and component ref and seeing 

532 # if there is something at the component Location even without 

533 # disassembly being enabled. 

534 if ref.datasetType.isComponent(): 

535 ref = ref.makeCompositeRef() 

536 

537 # See if the ref is a composite that should be disassembled 

538 doDisassembly = self.composites.shouldBeDisassembled(ref) 

539 

540 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

541 

542 if doDisassembly: 

543 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

544 compRef = ref.makeComponentRef(component) 

545 location, formatter = self._determine_put_formatter_location(compRef) 

546 all_info.append((location, formatter, componentStorage, component)) 

547 

548 else: 

549 # Always use the composite ref if no disassembly 

550 location, formatter = self._determine_put_formatter_location(ref) 

551 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

552 

553 # Convert the list of tuples to have StoredFileInfo as second element 

554 return [ 

555 ( 

556 location, 

557 StoredFileInfo( 

558 formatter=formatter, 

559 path=location.pathInStore.path, 

560 storageClass=storageClass, 

561 component=component, 

562 checksum=None, 

563 file_size=-1, 

564 dataset_id=ref.id, 

565 ), 

566 ) 

567 for location, formatter, storageClass, component in all_info 

568 ] 

569 

570 def _prepare_for_get( 

571 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

572 ) -> list[DatastoreFileGetInformation]: 

573 """Check parameters for ``get`` and obtain formatter and 

574 location. 

575 

576 Parameters 

577 ---------- 

578 ref : `DatasetRef` 

579 Reference to the required Dataset. 

580 parameters : `dict` 

581 `StorageClass`-specific parameters that specify, for example, 

582 a slice of the dataset to be loaded. 

583 

584 Returns 

585 ------- 

586 getInfo : `list` [`DatastoreFileGetInformation`] 

587 Parameters needed to retrieve each file. 

588 """ 

589 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

590 

591 # The storage class we want to use eventually 

592 refStorageClass = ref.datasetType.storageClass 

593 

594 # For trusted mode need to reset storage class. 

595 ref = self._cast_storage_class(ref) 

596 

597 # Get file metadata and internal metadata 

598 fileLocations = self._get_dataset_locations_info(ref) 

599 if not fileLocations: 

600 if not self.trustGetRequest: 

601 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

602 # Assume the dataset is where we think it should be 

603 fileLocations = self._get_expected_dataset_locations_info(ref) 

604 

605 if len(fileLocations) > 1: 

606 disassembled = True 

607 

608 # If trust is involved it is possible that there will be 

609 # components listed here that do not exist in the datastore. 

610 # Explicitly check for file artifact existence and filter out any 

611 # that are missing. 

612 if self.trustGetRequest: 

613 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

614 

615 # For now complain only if we have no components at all. One 

616 # component is probably a problem but we can punt that to the 

617 # assembler. 

618 if not fileLocations: 

619 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

620 

621 else: 

622 disassembled = False 

623 

624 # Is this a component request? 

625 refComponent = ref.datasetType.component() 

626 

627 fileGetInfo = [] 

628 for location, storedFileInfo in fileLocations: 

629 # The storage class used to write the file 

630 writeStorageClass = storedFileInfo.storageClass 

631 

632 # If this has been disassembled we need read to match the write 

633 if disassembled: 

634 readStorageClass = writeStorageClass 

635 else: 

636 readStorageClass = refStorageClass 

637 

638 formatter = get_instance_of( 

639 storedFileInfo.formatter, 

640 FileDescriptor( 

641 location, 

642 readStorageClass=readStorageClass, 

643 storageClass=writeStorageClass, 

644 parameters=parameters, 

645 ), 

646 ref.dataId, 

647 ) 

648 

649 formatterParams, notFormatterParams = formatter.segregateParameters() 

650 

651 # Of the remaining parameters, extract the ones supported by 

652 # this StorageClass (for components not all will be handled) 

653 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

654 

655 # The ref itself could be a component if the dataset was 

656 # disassembled by butler, or we disassembled in datastore and 

657 # components came from the datastore records 

658 component = storedFileInfo.component if storedFileInfo.component else refComponent 

659 

660 fileGetInfo.append( 

661 DatastoreFileGetInformation( 

662 location, 

663 formatter, 

664 storedFileInfo, 

665 assemblerParams, 

666 formatterParams, 

667 component, 

668 readStorageClass, 

669 ) 

670 ) 

671 

672 return fileGetInfo 

673 

674 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

675 """Check the arguments for ``put`` and obtain formatter and 

676 location. 

677 

678 Parameters 

679 ---------- 

680 inMemoryDataset : `object` 

681 The dataset to store. 

682 ref : `DatasetRef` 

683 Reference to the associated Dataset. 

684 

685 Returns 

686 ------- 

687 location : `Location` 

688 The location to write the dataset. 

689 formatter : `Formatter` 

690 The `Formatter` to use to write the dataset. 

691 

692 Raises 

693 ------ 

694 TypeError 

695 Supplied object and storage class are inconsistent. 

696 DatasetTypeNotSupportedError 

697 The associated `DatasetType` is not handled by this datastore. 

698 """ 

699 self._validate_put_parameters(inMemoryDataset, ref) 

700 return self._determine_put_formatter_location(ref) 

701 

702 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

703 """Calculate the formatter and output location to use for put. 

704 

705 Parameters 

706 ---------- 

707 ref : `DatasetRef` 

708 Reference to the associated Dataset. 

709 

710 Returns 

711 ------- 

712 location : `Location` 

713 The location to write the dataset. 

714 formatter : `Formatter` 

715 The `Formatter` to use to write the dataset. 

716 """ 

717 # Work out output file name 

718 try: 

719 template = self.templates.getTemplate(ref) 

720 except KeyError as e: 

721 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

722 

723 # Validate the template to protect against filenames from different 

724 # dataIds returning the same and causing overwrite confusion. 

725 template.validateTemplate(ref) 

726 

727 location = self.locationFactory.fromPath(template.format(ref)) 

728 

729 # Get the formatter based on the storage class 

730 storageClass = ref.datasetType.storageClass 

731 try: 

732 formatter = self.formatterFactory.getFormatter( 

733 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

734 ) 

735 except KeyError as e: 

736 raise DatasetTypeNotSupportedError( 

737 f"Unable to find formatter for {ref} in datastore {self.name}" 

738 ) from e 

739 

740 # Now that we know the formatter, update the location 

741 location = formatter.makeUpdatedLocation(location) 

742 

743 return location, formatter 

744 

745 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

746 # Docstring inherited from base class 

747 if transfer != "auto": 

748 return transfer 

749 

750 # See if the paths are within the datastore or not 

751 inside = [self._pathInStore(d.path) is not None for d in datasets] 

752 

753 if all(inside): 

754 transfer = None 

755 elif not any(inside): 

756 # Allow ResourcePath to use its own knowledge 

757 transfer = "auto" 

758 else: 

759 # This can happen when importing from a datastore that 

760 # has had some datasets ingested using "direct" mode. 

761 # Also allow ResourcePath to sort it out but warn about it. 

762 # This can happen if you are importing from a datastore 

763 # that had some direct transfer datasets. 

764 log.warning( 

765 "Some datasets are inside the datastore and some are outside. Using 'split' " 

766 "transfer mode. This assumes that the files outside the datastore are " 

767 "still accessible to the new butler since they will not be copied into " 

768 "the target datastore." 

769 ) 

770 transfer = "split" 

771 

772 return transfer 

773 

774 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

775 """Return path relative to datastore root. 

776 

777 Parameters 

778 ---------- 

779 path : `lsst.resources.ResourcePathExpression` 

780 Path to dataset. Can be absolute URI. If relative assumed to 

781 be relative to the datastore. Returns path in datastore 

782 or raises an exception if the path it outside. 

783 

784 Returns 

785 ------- 

786 inStore : `str` 

787 Path relative to datastore root. Returns `None` if the file is 

788 outside the root. 

789 """ 

790 # Relative path will always be relative to datastore 

791 pathUri = ResourcePath(path, forceAbsolute=False) 

792 return pathUri.relative_to(self.root) 

793 

794 def _standardizeIngestPath( 

795 self, path: str | ResourcePath, *, transfer: str | None = None 

796 ) -> str | ResourcePath: 

797 """Standardize the path of a to-be-ingested file. 

798 

799 Parameters 

800 ---------- 

801 path : `str` or `lsst.resources.ResourcePath` 

802 Path of a file to be ingested. This parameter is not expected 

803 to be all the types that can be used to construct a 

804 `~lsst.resources.ResourcePath`. 

805 transfer : `str`, optional 

806 How (and whether) the dataset should be added to the datastore. 

807 See `ingest` for details of transfer modes. 

808 This implementation is provided only so 

809 `NotImplementedError` can be raised if the mode is not supported; 

810 actual transfers are deferred to `_extractIngestInfo`. 

811 

812 Returns 

813 ------- 

814 path : `str` or `lsst.resources.ResourcePath` 

815 New path in what the datastore considers standard form. If an 

816 absolute URI was given that will be returned unchanged. 

817 

818 Notes 

819 ----- 

820 Subclasses of `FileDatastore` can implement this method instead 

821 of `_prepIngest`. It should not modify the data repository or given 

822 file in any way. 

823 

824 Raises 

825 ------ 

826 NotImplementedError 

827 Raised if the datastore does not support the given transfer mode 

828 (including the case where ingest is not supported at all). 

829 FileNotFoundError 

830 Raised if one of the given files does not exist. 

831 """ 

832 if transfer not in (None, "direct", "split") + self.root.transferModes: 

833 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

834 

835 # A relative URI indicates relative to datastore root 

836 srcUri = ResourcePath(path, forceAbsolute=False) 

837 if not srcUri.isabs(): 

838 srcUri = self.root.join(path) 

839 

840 if not srcUri.exists(): 

841 raise FileNotFoundError( 

842 f"Resource at {srcUri} does not exist; note that paths to ingest " 

843 f"are assumed to be relative to {self.root} unless they are absolute." 

844 ) 

845 

846 if transfer is None: 

847 relpath = srcUri.relative_to(self.root) 

848 if not relpath: 

849 raise RuntimeError( 

850 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

851 ) 

852 

853 # Return the relative path within the datastore for internal 

854 # transfer 

855 path = relpath 

856 

857 return path 

858 

859 def _extractIngestInfo( 

860 self, 

861 path: ResourcePathExpression, 

862 ref: DatasetRef, 

863 *, 

864 formatter: Formatter | type[Formatter], 

865 transfer: str | None = None, 

866 record_validation_info: bool = True, 

867 ) -> StoredFileInfo: 

868 """Relocate (if necessary) and extract `StoredFileInfo` from a 

869 to-be-ingested file. 

870 

871 Parameters 

872 ---------- 

873 path : `lsst.resources.ResourcePathExpression` 

874 URI or path of a file to be ingested. 

875 ref : `DatasetRef` 

876 Reference for the dataset being ingested. Guaranteed to have 

877 ``dataset_id not None`. 

878 formatter : `type` or `Formatter` 

879 `Formatter` subclass to use for this dataset or an instance. 

880 transfer : `str`, optional 

881 How (and whether) the dataset should be added to the datastore. 

882 See `ingest` for details of transfer modes. 

883 record_validation_info : `bool`, optional 

884 If `True`, the default, the datastore can record validation 

885 information associated with the file. If `False` the datastore 

886 will not attempt to track any information such as checksums 

887 or file sizes. This can be useful if such information is tracked 

888 in an external system or if the file is to be compressed in place. 

889 It is up to the datastore whether this parameter is relevant. 

890 

891 Returns 

892 ------- 

893 info : `StoredFileInfo` 

894 Internal datastore record for this file. This will be inserted by 

895 the caller; the `_extractIngestInfo` is only responsible for 

896 creating and populating the struct. 

897 

898 Raises 

899 ------ 

900 FileNotFoundError 

901 Raised if one of the given files does not exist. 

902 FileExistsError 

903 Raised if transfer is not `None` but the (internal) location the 

904 file would be moved to is already occupied. 

905 """ 

906 if self._transaction is None: 

907 raise RuntimeError("Ingest called without transaction enabled") 

908 

909 # Create URI of the source path, do not need to force a relative 

910 # path to absolute. 

911 srcUri = ResourcePath(path, forceAbsolute=False) 

912 

913 # Track whether we have read the size of the source yet 

914 have_sized = False 

915 

916 tgtLocation: Location | None 

917 if transfer is None or transfer == "split": 

918 # A relative path is assumed to be relative to the datastore 

919 # in this context 

920 if not srcUri.isabs(): 

921 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

922 else: 

923 # Work out the path in the datastore from an absolute URI 

924 # This is required to be within the datastore. 

925 pathInStore = srcUri.relative_to(self.root) 

926 if pathInStore is None and transfer is None: 

927 raise RuntimeError( 

928 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

929 ) 

930 if pathInStore: 

931 tgtLocation = self.locationFactory.fromPath(pathInStore) 

932 elif transfer == "split": 

933 # Outside the datastore but treat that as a direct ingest 

934 # instead. 

935 tgtLocation = None 

936 else: 

937 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

938 elif transfer == "direct": 

939 # Want to store the full URI to the resource directly in 

940 # datastore. This is useful for referring to permanent archive 

941 # storage for raw data. 

942 # Trust that people know what they are doing. 

943 tgtLocation = None 

944 else: 

945 # Work out the name we want this ingested file to have 

946 # inside the datastore 

947 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

948 if not tgtLocation.uri.dirname().exists(): 

949 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

950 tgtLocation.uri.dirname().mkdir() 

951 

952 # if we are transferring from a local file to a remote location 

953 # it may be more efficient to get the size and checksum of the 

954 # local file rather than the transferred one 

955 if record_validation_info and srcUri.isLocal: 

956 size = srcUri.size() 

957 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

958 have_sized = True 

959 

960 # Transfer the resource to the destination. 

961 # Allow overwrite of an existing file. This matches the behavior 

962 # of datastore.put() in that it trusts that registry would not 

963 # be asking to overwrite unless registry thought that the 

964 # overwrite was allowed. 

965 tgtLocation.uri.transfer_from( 

966 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

967 ) 

968 

969 if tgtLocation is None: 

970 # This means we are using direct mode 

971 targetUri = srcUri 

972 targetPath = str(srcUri) 

973 else: 

974 targetUri = tgtLocation.uri 

975 targetPath = tgtLocation.pathInStore.path 

976 

977 # the file should exist in the datastore now 

978 if record_validation_info: 

979 if not have_sized: 

980 size = targetUri.size() 

981 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

982 else: 

983 # Not recording any file information. 

984 size = -1 

985 checksum = None 

986 

987 return StoredFileInfo( 

988 formatter=formatter, 

989 path=targetPath, 

990 storageClass=ref.datasetType.storageClass, 

991 component=ref.datasetType.component(), 

992 file_size=size, 

993 checksum=checksum, 

994 dataset_id=ref.id, 

995 ) 

996 

997 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

998 # Docstring inherited from Datastore._prepIngest. 

999 filtered = [] 

1000 for dataset in datasets: 

1001 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1002 if not acceptable: 

1003 continue 

1004 else: 

1005 dataset.refs = acceptable 

1006 if dataset.formatter is None: 

1007 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1008 else: 

1009 assert isinstance(dataset.formatter, type | str) 

1010 formatter_class = get_class_of(dataset.formatter) 

1011 if not issubclass(formatter_class, Formatter): 

1012 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1013 dataset.formatter = formatter_class 

1014 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1015 filtered.append(dataset) 

1016 return _IngestPrepData(filtered) 

1017 

1018 @transactional 

1019 def _finishIngest( 

1020 self, 

1021 prepData: Datastore.IngestPrepData, 

1022 *, 

1023 transfer: str | None = None, 

1024 record_validation_info: bool = True, 

1025 ) -> None: 

1026 # Docstring inherited from Datastore._finishIngest. 

1027 refsAndInfos = [] 

1028 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1029 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1030 # Do ingest as if the first dataset ref is associated with the file 

1031 info = self._extractIngestInfo( 

1032 dataset.path, 

1033 dataset.refs[0], 

1034 formatter=dataset.formatter, 

1035 transfer=transfer, 

1036 record_validation_info=record_validation_info, 

1037 ) 

1038 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1039 self._register_datasets(refsAndInfos) 

1040 

1041 def _calculate_ingested_datastore_name( 

1042 self, 

1043 srcUri: ResourcePath, 

1044 ref: DatasetRef, 

1045 formatter: Formatter | type[Formatter] | None = None, 

1046 ) -> Location: 

1047 """Given a source URI and a DatasetRef, determine the name the 

1048 dataset will have inside datastore. 

1049 

1050 Parameters 

1051 ---------- 

1052 srcUri : `lsst.resources.ResourcePath` 

1053 URI to the source dataset file. 

1054 ref : `DatasetRef` 

1055 Ref associated with the newly-ingested dataset artifact. This 

1056 is used to determine the name within the datastore. 

1057 formatter : `Formatter` or Formatter class. 

1058 Formatter to use for validation. Can be a class or an instance. 

1059 No validation of the file extension is performed if the 

1060 ``formatter`` is `None`. This can be used if the caller knows 

1061 that the source URI and target URI will use the same formatter. 

1062 

1063 Returns 

1064 ------- 

1065 location : `Location` 

1066 Target location for the newly-ingested dataset. 

1067 """ 

1068 # Ingesting a file from outside the datastore. 

1069 # This involves a new name. 

1070 template = self.templates.getTemplate(ref) 

1071 location = self.locationFactory.fromPath(template.format(ref)) 

1072 

1073 # Get the extension 

1074 ext = srcUri.getExtension() 

1075 

1076 # Update the destination to include that extension 

1077 location.updateExtension(ext) 

1078 

1079 # Ask the formatter to validate this extension 

1080 if formatter is not None: 

1081 formatter.validateExtension(location) 

1082 

1083 return location 

1084 

1085 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1086 """Write out in memory dataset to datastore. 

1087 

1088 Parameters 

1089 ---------- 

1090 inMemoryDataset : `object` 

1091 Dataset to write to datastore. 

1092 ref : `DatasetRef` 

1093 Registry information associated with this dataset. 

1094 

1095 Returns 

1096 ------- 

1097 info : `StoredFileInfo` 

1098 Information describing the artifact written to the datastore. 

1099 """ 

1100 # May need to coerce the in memory dataset to the correct 

1101 # python type, but first we need to make sure the storage class 

1102 # reflects the one defined in the data repository. 

1103 ref = self._cast_storage_class(ref) 

1104 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1105 

1106 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1107 uri = location.uri 

1108 

1109 if not uri.dirname().exists(): 

1110 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1111 uri.dirname().mkdir() 

1112 

1113 if self._transaction is None: 

1114 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1115 

1116 def _removeFileExists(uri: ResourcePath) -> None: 

1117 """Remove a file and do not complain if it is not there. 

1118 

1119 This is important since a formatter might fail before the file 

1120 is written and we should not confuse people by writing spurious 

1121 error messages to the log. 

1122 """ 

1123 with contextlib.suppress(FileNotFoundError): 

1124 uri.remove() 

1125 

1126 # Register a callback to try to delete the uploaded data if 

1127 # something fails below 

1128 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1129 

1130 data_written = False 

1131 

1132 # For remote URIs some datasets can be serialized directly 

1133 # to bytes and sent to the remote datastore without writing a 

1134 # file. If the dataset is intended to be saved to the cache 

1135 # a file is always written and direct write to the remote 

1136 # datastore is bypassed. 

1137 if not uri.isLocal and not self.cacheManager.should_be_cached(ref): 

1138 # Remote URI that is not cached so can write directly. 

1139 try: 

1140 serializedDataset = formatter.toBytes(inMemoryDataset) 

1141 except NotImplementedError: 

1142 # Fallback to the file writing option. 

1143 pass 

1144 except Exception as e: 

1145 raise RuntimeError( 

1146 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1147 ) from e 

1148 else: 

1149 log.debug("Writing bytes directly to %s", uri) 

1150 uri.write(serializedDataset, overwrite=True) 

1151 log.debug("Successfully wrote bytes directly to %s", uri) 

1152 data_written = True 

1153 

1154 if not data_written: 

1155 # Did not write the bytes directly to object store so instead 

1156 # write to temporary file. Always write to a temporary even if 

1157 # using a local file system -- that gives us atomic writes. 

1158 # If a process is killed as the file is being written we do not 

1159 # want it to remain in the correct place but in corrupt state. 

1160 # For local files write to the output directory not temporary dir. 

1161 prefix = uri.dirname() if uri.isLocal else None 

1162 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1163 # Need to configure the formatter to write to a different 

1164 # location and that needs us to overwrite internals 

1165 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1166 with formatter._updateLocation(Location(None, temporary_uri)): 

1167 try: 

1168 formatter.write(inMemoryDataset) 

1169 except Exception as e: 

1170 raise RuntimeError( 

1171 f"Failed to serialize dataset {ref} of type" 

1172 f" {type(inMemoryDataset)} to " 

1173 f"temporary location {temporary_uri}" 

1174 ) from e 

1175 

1176 # Use move for a local file since that becomes an efficient 

1177 # os.rename. For remote resources we use copy to allow the 

1178 # file to be cached afterwards. 

1179 transfer = "move" if uri.isLocal else "copy" 

1180 

1181 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1182 

1183 if transfer == "copy": 

1184 # Cache if required 

1185 self.cacheManager.move_to_cache(temporary_uri, ref) 

1186 

1187 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1188 

1189 # URI is needed to resolve what ingest case are we dealing with 

1190 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1191 

1192 def _read_artifact_into_memory( 

1193 self, 

1194 getInfo: DatastoreFileGetInformation, 

1195 ref: DatasetRef, 

1196 isComponent: bool = False, 

1197 cache_ref: DatasetRef | None = None, 

1198 ) -> Any: 

1199 """Read the artifact from datastore into in memory object. 

1200 

1201 Parameters 

1202 ---------- 

1203 getInfo : `DatastoreFileGetInformation` 

1204 Information about the artifact within the datastore. 

1205 ref : `DatasetRef` 

1206 The registry information associated with this artifact. 

1207 isComponent : `bool` 

1208 Flag to indicate if a component is being read from this artifact. 

1209 cache_ref : `DatasetRef`, optional 

1210 The DatasetRef to use when looking up the file in the cache. 

1211 This ref must have the same ID as the supplied ref but can 

1212 be a parent ref or component ref to indicate to the cache whether 

1213 a composite file is being requested from the cache or a component 

1214 file. Without this the cache will default to the supplied ref but 

1215 it can get confused with read-only derived components for 

1216 disassembled composites. 

1217 

1218 Returns 

1219 ------- 

1220 inMemoryDataset : `object` 

1221 The artifact as a python object. 

1222 """ 

1223 location = getInfo.location 

1224 uri = location.uri 

1225 log.debug("Accessing data from %s", uri) 

1226 

1227 if cache_ref is None: 

1228 cache_ref = ref 

1229 if cache_ref.id != ref.id: 

1230 raise ValueError( 

1231 "The supplied cache dataset ref refers to a different dataset than expected:" 

1232 f" {ref.id} != {cache_ref.id}" 

1233 ) 

1234 

1235 # Cannot recalculate checksum but can compare size as a quick check 

1236 # Do not do this if the size is negative since that indicates 

1237 # we do not know. 

1238 recorded_size = getInfo.info.file_size 

1239 resource_size = uri.size() 

1240 if recorded_size >= 0 and resource_size != recorded_size: 

1241 raise RuntimeError( 

1242 "Integrity failure in Datastore. " 

1243 f"Size of file {uri} ({resource_size}) " 

1244 f"does not match size recorded in registry of {recorded_size}" 

1245 ) 

1246 

1247 # For the general case we have choices for how to proceed. 

1248 # 1. Always use a local file (downloading the remote resource to a 

1249 # temporary file if needed). 

1250 # 2. Use a threshold size and read into memory and use bytes. 

1251 # Use both for now with an arbitrary hand off size. 

1252 # This allows small datasets to be downloaded from remote object 

1253 # stores without requiring a temporary file. 

1254 

1255 formatter = getInfo.formatter 

1256 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1257 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1258 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1259 if cached_file is not None: 

1260 desired_uri = cached_file 

1261 msg = f" (cached version of {uri})" 

1262 else: 

1263 desired_uri = uri 

1264 msg = "" 

1265 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1266 serializedDataset = desired_uri.read() 

1267 log.debug( 

1268 "Deserializing %s from %d bytes from location %s with formatter %s", 

1269 f"component {getInfo.component}" if isComponent else "", 

1270 len(serializedDataset), 

1271 uri, 

1272 formatter.name(), 

1273 ) 

1274 try: 

1275 result = formatter.fromBytes( 

1276 serializedDataset, component=getInfo.component if isComponent else None 

1277 ) 

1278 except Exception as e: 

1279 raise ValueError( 

1280 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1281 f" ({ref.datasetType.name} from {uri}): {e}" 

1282 ) from e 

1283 else: 

1284 # Read from file. 

1285 

1286 # Have to update the Location associated with the formatter 

1287 # because formatter.read does not allow an override. 

1288 # This could be improved. 

1289 location_updated = False 

1290 msg = "" 

1291 

1292 # First check in cache for local version. 

1293 # The cache will only be relevant for remote resources but 

1294 # no harm in always asking. Context manager ensures that cache 

1295 # file is not deleted during cache expiration. 

1296 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1297 if cached_file is not None: 

1298 msg = f"(via cache read of remote file {uri})" 

1299 uri = cached_file 

1300 location_updated = True 

1301 

1302 with uri.as_local() as local_uri: 

1303 can_be_cached = False 

1304 if uri != local_uri: 

1305 # URI was remote and file was downloaded 

1306 cache_msg = "" 

1307 location_updated = True 

1308 

1309 if self.cacheManager.should_be_cached(cache_ref): 

1310 # In this scenario we want to ask if the downloaded 

1311 # file should be cached but we should not cache 

1312 # it until after we've used it (to ensure it can't 

1313 # be expired whilst we are using it). 

1314 can_be_cached = True 

1315 

1316 # Say that it is "likely" to be cached because 

1317 # if the formatter read fails we will not be 

1318 # caching this file. 

1319 cache_msg = " and likely cached" 

1320 

1321 msg = f"(via download to local file{cache_msg})" 

1322 

1323 # Calculate the (possibly) new location for the formatter 

1324 # to use. 

1325 newLocation = Location(*local_uri.split()) if location_updated else None 

1326 

1327 log.debug( 

1328 "Reading%s from location %s %s with formatter %s", 

1329 f" component {getInfo.component}" if isComponent else "", 

1330 uri, 

1331 msg, 

1332 formatter.name(), 

1333 ) 

1334 try: 

1335 with ( 

1336 formatter._updateLocation(newLocation), 

1337 time_this( 

1338 log, 

1339 msg="Reading%s from location %s %s with formatter %s", 

1340 args=( 

1341 f" component {getInfo.component}" if isComponent else "", 

1342 uri, 

1343 msg, 

1344 formatter.name(), 

1345 ), 

1346 ), 

1347 ): 

1348 result = formatter.read(component=getInfo.component if isComponent else None) 

1349 except Exception as e: 

1350 raise ValueError( 

1351 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1352 f" ({ref.datasetType.name} from {uri}): {e}" 

1353 ) from e 

1354 

1355 # File was read successfully so can move to cache 

1356 if can_be_cached: 

1357 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1358 

1359 return self._post_process_get( 

1360 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1361 ) 

1362 

1363 def knows(self, ref: DatasetRef) -> bool: 

1364 """Check if the dataset is known to the datastore. 

1365 

1366 Does not check for existence of any artifact. 

1367 

1368 Parameters 

1369 ---------- 

1370 ref : `DatasetRef` 

1371 Reference to the required dataset. 

1372 

1373 Returns 

1374 ------- 

1375 exists : `bool` 

1376 `True` if the dataset is known to the datastore. 

1377 """ 

1378 fileLocations = self._get_dataset_locations_info(ref) 

1379 if fileLocations: 

1380 return True 

1381 return False 

1382 

1383 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1384 # Docstring inherited from the base class. 

1385 

1386 # The records themselves. Could be missing some entries. 

1387 records = self._get_stored_records_associated_with_refs(refs) 

1388 

1389 return {ref: ref.id in records for ref in refs} 

1390 

1391 def _process_mexists_records( 

1392 self, 

1393 id_to_ref: dict[DatasetId, DatasetRef], 

1394 records: dict[DatasetId, list[StoredFileInfo]], 

1395 all_required: bool, 

1396 artifact_existence: dict[ResourcePath, bool] | None = None, 

1397 ) -> dict[DatasetRef, bool]: 

1398 """Check given records for existence. 

1399 

1400 Helper function for `mexists()`. 

1401 

1402 Parameters 

1403 ---------- 

1404 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1405 Mapping of the dataset ID to the dataset ref itself. 

1406 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1407 Records as generally returned by 

1408 ``_get_stored_records_associated_with_refs``. 

1409 all_required : `bool` 

1410 Flag to indicate whether existence requires all artifacts 

1411 associated with a dataset ID to exist or not for existence. 

1412 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1413 Optional mapping of datastore artifact to existence. Updated by 

1414 this method with details of all artifacts tested. Can be `None` 

1415 if the caller is not interested. 

1416 

1417 Returns 

1418 ------- 

1419 existence : `dict` of [`DatasetRef`, `bool`] 

1420 Mapping from dataset to boolean indicating existence. 

1421 """ 

1422 # The URIs to be checked and a mapping of those URIs to 

1423 # the dataset ID. 

1424 uris_to_check: list[ResourcePath] = [] 

1425 location_map: dict[ResourcePath, DatasetId] = {} 

1426 

1427 location_factory = self.locationFactory 

1428 

1429 uri_existence: dict[ResourcePath, bool] = {} 

1430 for ref_id, infos in records.items(): 

1431 # Key is the dataset Id, value is list of StoredItemInfo 

1432 uris = [info.file_location(location_factory).uri for info in infos] 

1433 location_map.update({uri: ref_id for uri in uris}) 

1434 

1435 # Check the local cache directly for a dataset corresponding 

1436 # to the remote URI. 

1437 if self.cacheManager.file_count > 0: 

1438 ref = id_to_ref[ref_id] 

1439 for uri, storedFileInfo in zip(uris, infos, strict=True): 

1440 check_ref = ref 

1441 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1442 check_ref = ref.makeComponentRef(component) 

1443 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1444 # Proxy for URI existence. 

1445 uri_existence[uri] = True 

1446 else: 

1447 uris_to_check.append(uri) 

1448 else: 

1449 # Check all of them. 

1450 uris_to_check.extend(uris) 

1451 

1452 if artifact_existence is not None: 

1453 # If a URI has already been checked remove it from the list 

1454 # and immediately add the status to the output dict. 

1455 filtered_uris_to_check = [] 

1456 for uri in uris_to_check: 

1457 if uri in artifact_existence: 

1458 uri_existence[uri] = artifact_existence[uri] 

1459 else: 

1460 filtered_uris_to_check.append(uri) 

1461 uris_to_check = filtered_uris_to_check 

1462 

1463 # Results. 

1464 dataset_existence: dict[DatasetRef, bool] = {} 

1465 

1466 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1467 for uri, exists in uri_existence.items(): 

1468 dataset_id = location_map[uri] 

1469 ref = id_to_ref[dataset_id] 

1470 

1471 # Disassembled composite needs to check all locations. 

1472 # all_required indicates whether all need to exist or not. 

1473 if ref in dataset_existence: 

1474 if all_required: 

1475 exists = dataset_existence[ref] and exists 

1476 else: 

1477 exists = dataset_existence[ref] or exists 

1478 dataset_existence[ref] = exists 

1479 

1480 if artifact_existence is not None: 

1481 artifact_existence.update(uri_existence) 

1482 

1483 return dataset_existence 

1484 

1485 def mexists( 

1486 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1487 ) -> dict[DatasetRef, bool]: 

1488 """Check the existence of multiple datasets at once. 

1489 

1490 Parameters 

1491 ---------- 

1492 refs : iterable of `DatasetRef` 

1493 The datasets to be checked. 

1494 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1495 Optional mapping of datastore artifact to existence. Updated by 

1496 this method with details of all artifacts tested. Can be `None` 

1497 if the caller is not interested. 

1498 

1499 Returns 

1500 ------- 

1501 existence : `dict` of [`DatasetRef`, `bool`] 

1502 Mapping from dataset to boolean indicating existence. 

1503 

1504 Notes 

1505 ----- 

1506 To minimize potentially costly remote existence checks, the local 

1507 cache is checked as a proxy for existence. If a file for this 

1508 `DatasetRef` does exist no check is done for the actual URI. This 

1509 could result in possibly unexpected behavior if the dataset itself 

1510 has been removed from the datastore by another process whilst it is 

1511 still in the cache. 

1512 """ 

1513 chunk_size = 10_000 

1514 dataset_existence: dict[DatasetRef, bool] = {} 

1515 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1516 n_found_total = 0 

1517 n_checked = 0 

1518 n_chunks = 0 

1519 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1520 chunk_result = self._mexists(chunk, artifact_existence) 

1521 

1522 # The log message level and content depend on how many 

1523 # datasets we are processing. 

1524 n_results = len(chunk_result) 

1525 

1526 # Use verbose logging to ensure that messages can be seen 

1527 # easily if many refs are being checked. 

1528 log_threshold = VERBOSE 

1529 n_checked += n_results 

1530 

1531 # This sum can take some time so only do it if we know the 

1532 # result is going to be used. 

1533 n_found = 0 

1534 if log.isEnabledFor(log_threshold): 

1535 # Can treat the booleans as 0, 1 integers and sum them. 

1536 n_found = sum(chunk_result.values()) 

1537 n_found_total += n_found 

1538 

1539 # We are deliberately not trying to count the number of refs 

1540 # provided in case it's in the millions. This means there is a 

1541 # situation where the number of refs exactly matches the chunk 

1542 # size and we will switch to the multi-chunk path even though 

1543 # we only have a single chunk. 

1544 if n_results < chunk_size and n_chunks == 0: 

1545 # Single chunk will be processed so we can provide more detail. 

1546 if n_results == 1: 

1547 ref = list(chunk_result)[0] 

1548 # Use debug logging to be consistent with `exists()`. 

1549 log.debug( 

1550 "Calling mexists() with single ref that does%s exist (%s).", 

1551 "" if chunk_result[ref] else " not", 

1552 ref, 

1553 ) 

1554 else: 

1555 # Single chunk but multiple files. Summarize. 

1556 log.log( 

1557 log_threshold, 

1558 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1559 n_found, 

1560 n_checked, 

1561 ) 

1562 

1563 else: 

1564 # Use incremental verbose logging when we have multiple chunks. 

1565 log.log( 

1566 log_threshold, 

1567 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1568 "(running total from all chunks so far: %d found out of %d checked)", 

1569 n_chunks, 

1570 n_found, 

1571 n_results, 

1572 n_found_total, 

1573 n_checked, 

1574 ) 

1575 dataset_existence.update(chunk_result) 

1576 n_chunks += 1 

1577 

1578 return dataset_existence 

1579 

1580 def _mexists( 

1581 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1582 ) -> dict[DatasetRef, bool]: 

1583 """Check the existence of multiple datasets at once. 

1584 

1585 Parameters 

1586 ---------- 

1587 refs : iterable of `DatasetRef` 

1588 The datasets to be checked. 

1589 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1590 Optional mapping of datastore artifact to existence. Updated by 

1591 this method with details of all artifacts tested. Can be `None` 

1592 if the caller is not interested. 

1593 

1594 Returns 

1595 ------- 

1596 existence : `dict` of [`DatasetRef`, `bool`] 

1597 Mapping from dataset to boolean indicating existence. 

1598 """ 

1599 # Make a mapping from refs with the internal storage class to the given 

1600 # refs that may have a different one. We'll use the internal refs 

1601 # throughout this method and convert back at the very end. 

1602 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1603 

1604 # Need a mapping of dataset_id to (internal) dataset ref since some 

1605 # internal APIs work with dataset_id. 

1606 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1607 

1608 # Set of all IDs we are checking for. 

1609 requested_ids = set(id_to_ref.keys()) 

1610 

1611 # The records themselves. Could be missing some entries. 

1612 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1613 

1614 dataset_existence = self._process_mexists_records( 

1615 id_to_ref, records, True, artifact_existence=artifact_existence 

1616 ) 

1617 

1618 # Set of IDs that have been handled. 

1619 handled_ids = {ref.id for ref in dataset_existence} 

1620 

1621 missing_ids = requested_ids - handled_ids 

1622 if missing_ids: 

1623 dataset_existence.update( 

1624 self._mexists_check_expected( 

1625 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1626 ) 

1627 ) 

1628 

1629 return { 

1630 internal_ref_to_input_ref[internal_ref]: existence 

1631 for internal_ref, existence in dataset_existence.items() 

1632 } 

1633 

1634 def _mexists_check_expected( 

1635 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1636 ) -> dict[DatasetRef, bool]: 

1637 """Check existence of refs that are not known to datastore. 

1638 

1639 Parameters 

1640 ---------- 

1641 refs : iterable of `DatasetRef` 

1642 The datasets to be checked. These are assumed not to be known 

1643 to datastore. 

1644 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1645 Optional mapping of datastore artifact to existence. Updated by 

1646 this method with details of all artifacts tested. Can be `None` 

1647 if the caller is not interested. 

1648 

1649 Returns 

1650 ------- 

1651 existence : `dict` of [`DatasetRef`, `bool`] 

1652 Mapping from dataset to boolean indicating existence. 

1653 """ 

1654 dataset_existence: dict[DatasetRef, bool] = {} 

1655 if not self.trustGetRequest: 

1656 # Must assume these do not exist 

1657 for ref in refs: 

1658 dataset_existence[ref] = False 

1659 else: 

1660 log.debug( 

1661 "%d datasets were not known to datastore during initial existence check.", 

1662 len(refs), 

1663 ) 

1664 

1665 # Construct data structure identical to that returned 

1666 # by _get_stored_records_associated_with_refs() but using 

1667 # guessed names. 

1668 records = {} 

1669 id_to_ref = {} 

1670 for missing_ref in refs: 

1671 expected = self._get_expected_dataset_locations_info(missing_ref) 

1672 dataset_id = missing_ref.id 

1673 records[dataset_id] = [info for _, info in expected] 

1674 id_to_ref[dataset_id] = missing_ref 

1675 

1676 dataset_existence.update( 

1677 self._process_mexists_records( 

1678 id_to_ref, 

1679 records, 

1680 False, 

1681 artifact_existence=artifact_existence, 

1682 ) 

1683 ) 

1684 

1685 return dataset_existence 

1686 

1687 def exists(self, ref: DatasetRef) -> bool: 

1688 """Check if the dataset exists in the datastore. 

1689 

1690 Parameters 

1691 ---------- 

1692 ref : `DatasetRef` 

1693 Reference to the required dataset. 

1694 

1695 Returns 

1696 ------- 

1697 exists : `bool` 

1698 `True` if the entity exists in the `Datastore`. 

1699 

1700 Notes 

1701 ----- 

1702 The local cache is checked as a proxy for existence in the remote 

1703 object store. It is possible that another process on a different 

1704 compute node could remove the file from the object store even 

1705 though it is present in the local cache. 

1706 """ 

1707 ref = self._cast_storage_class(ref) 

1708 fileLocations = self._get_dataset_locations_info(ref) 

1709 

1710 # if we are being asked to trust that registry might not be correct 

1711 # we ask for the expected locations and check them explicitly 

1712 if not fileLocations: 

1713 if not self.trustGetRequest: 

1714 return False 

1715 

1716 # First check the cache. If it is not found we must check 

1717 # the datastore itself. Assume that any component in the cache 

1718 # means that the dataset does exist somewhere. 

1719 if self.cacheManager.known_to_cache(ref): 

1720 return True 

1721 

1722 # When we are guessing a dataset location we can not check 

1723 # for the existence of every component since we can not 

1724 # know if every component was written. Instead we check 

1725 # for the existence of any of the expected locations. 

1726 for location, _ in self._get_expected_dataset_locations_info(ref): 

1727 if self._artifact_exists(location): 

1728 return True 

1729 return False 

1730 

1731 # All listed artifacts must exist. 

1732 for location, storedFileInfo in fileLocations: 

1733 # Checking in cache needs the component ref. 

1734 check_ref = ref 

1735 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1736 check_ref = ref.makeComponentRef(component) 

1737 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1738 continue 

1739 

1740 if not self._artifact_exists(location): 

1741 return False 

1742 

1743 return True 

1744 

1745 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1746 """Return URIs associated with dataset. 

1747 

1748 Parameters 

1749 ---------- 

1750 ref : `DatasetRef` 

1751 Reference to the required dataset. 

1752 predict : `bool`, optional 

1753 If the datastore does not know about the dataset, should it 

1754 return a predicted URI or not? 

1755 

1756 Returns 

1757 ------- 

1758 uris : `DatasetRefURIs` 

1759 The URI to the primary artifact associated with this dataset (if 

1760 the dataset was disassembled within the datastore this may be 

1761 `None`), and the URIs to any components associated with the dataset 

1762 artifact. (can be empty if there are no components). 

1763 """ 

1764 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1765 return many[ref] 

1766 

1767 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1768 """URI to the Dataset. 

1769 

1770 Parameters 

1771 ---------- 

1772 ref : `DatasetRef` 

1773 Reference to the required Dataset. 

1774 predict : `bool` 

1775 If `True`, allow URIs to be returned of datasets that have not 

1776 been written. 

1777 

1778 Returns 

1779 ------- 

1780 uri : `str` 

1781 URI pointing to the dataset within the datastore. If the 

1782 dataset does not exist in the datastore, and if ``predict`` is 

1783 `True`, the URI will be a prediction and will include a URI 

1784 fragment "#predicted". 

1785 If the datastore does not have entities that relate well 

1786 to the concept of a URI the returned URI will be 

1787 descriptive. The returned URI is not guaranteed to be obtainable. 

1788 

1789 Raises 

1790 ------ 

1791 FileNotFoundError 

1792 Raised if a URI has been requested for a dataset that does not 

1793 exist and guessing is not allowed. 

1794 RuntimeError 

1795 Raised if a request is made for a single URI but multiple URIs 

1796 are associated with this dataset. 

1797 

1798 Notes 

1799 ----- 

1800 When a predicted URI is requested an attempt will be made to form 

1801 a reasonable URI based on file templates and the expected formatter. 

1802 """ 

1803 primary, components = self.getURIs(ref, predict) 

1804 if primary is None or components: 

1805 raise RuntimeError( 

1806 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1807 ) 

1808 return primary 

1809 

1810 def _predict_URIs( 

1811 self, 

1812 ref: DatasetRef, 

1813 ) -> DatasetRefURIs: 

1814 """Predict the URIs of a dataset ref. 

1815 

1816 Parameters 

1817 ---------- 

1818 ref : `DatasetRef` 

1819 Reference to the required Dataset. 

1820 

1821 Returns 

1822 ------- 

1823 URI : DatasetRefUris 

1824 Primary and component URIs. URIs will contain a URI fragment 

1825 "#predicted". 

1826 """ 

1827 uris = DatasetRefURIs() 

1828 

1829 if self.composites.shouldBeDisassembled(ref): 

1830 for component, _ in ref.datasetType.storageClass.components.items(): 

1831 comp_ref = ref.makeComponentRef(component) 

1832 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1833 

1834 # Add the "#predicted" URI fragment to indicate this is a 

1835 # guess 

1836 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1837 

1838 else: 

1839 location, _ = self._determine_put_formatter_location(ref) 

1840 

1841 # Add the "#predicted" URI fragment to indicate this is a guess 

1842 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1843 

1844 return uris 

1845 

1846 def getManyURIs( 

1847 self, 

1848 refs: Iterable[DatasetRef], 

1849 predict: bool = False, 

1850 allow_missing: bool = False, 

1851 ) -> dict[DatasetRef, DatasetRefURIs]: 

1852 # Docstring inherited 

1853 

1854 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1855 

1856 records = self._get_stored_records_associated_with_refs(refs) 

1857 records_keys = records.keys() 

1858 

1859 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1860 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1861 

1862 # Have to handle trustGetRequest mode by checking for the existence 

1863 # of the missing refs on disk. 

1864 if missing_refs: 

1865 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1866 really_missing = set() 

1867 not_missing = set() 

1868 for ref, exists in dataset_existence.items(): 

1869 if exists: 

1870 not_missing.add(ref) 

1871 else: 

1872 really_missing.add(ref) 

1873 

1874 if not_missing: 

1875 # Need to recalculate the missing/existing split. 

1876 existing_refs = existing_refs + tuple(not_missing) 

1877 missing_refs = tuple(really_missing) 

1878 

1879 for ref in missing_refs: 

1880 # if this has never been written then we have to guess 

1881 if not predict: 

1882 if not allow_missing: 

1883 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1884 else: 

1885 uris[ref] = self._predict_URIs(ref) 

1886 

1887 for ref in existing_refs: 

1888 file_infos = records[ref.id] 

1889 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1890 uris[ref] = self._locations_to_URI(ref, file_locations) 

1891 

1892 return uris 

1893 

1894 def _locations_to_URI( 

1895 self, 

1896 ref: DatasetRef, 

1897 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1898 ) -> DatasetRefURIs: 

1899 """Convert one or more file locations associated with a DatasetRef 

1900 to a DatasetRefURIs. 

1901 

1902 Parameters 

1903 ---------- 

1904 ref : `DatasetRef` 

1905 Reference to the dataset. 

1906 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1907 Each item in the sequence is the location of the dataset within the 

1908 datastore and stored information about the file and its formatter. 

1909 If there is only one item in the sequence then it is treated as the 

1910 primary URI. If there is more than one item then they are treated 

1911 as component URIs. If there are no items then an error is raised 

1912 unless ``self.trustGetRequest`` is `True`. 

1913 

1914 Returns 

1915 ------- 

1916 uris: DatasetRefURIs 

1917 Represents the primary URI or component URIs described by the 

1918 inputs. 

1919 

1920 Raises 

1921 ------ 

1922 RuntimeError 

1923 If no file locations are passed in and ``self.trustGetRequest`` is 

1924 `False`. 

1925 FileNotFoundError 

1926 If the a passed-in URI does not exist, and ``self.trustGetRequest`` 

1927 is `False`. 

1928 RuntimeError 

1929 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1930 unexpected). 

1931 """ 

1932 guessing = False 

1933 uris = DatasetRefURIs() 

1934 

1935 if not file_locations: 

1936 if not self.trustGetRequest: 

1937 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1938 file_locations = self._get_expected_dataset_locations_info(ref) 

1939 guessing = True 

1940 

1941 if len(file_locations) == 1: 

1942 # No disassembly so this is the primary URI 

1943 uris.primaryURI = file_locations[0][0].uri 

1944 if guessing and not uris.primaryURI.exists(): 

1945 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1946 else: 

1947 for location, file_info in file_locations: 

1948 if file_info.component is None: 

1949 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1950 if guessing and not location.uri.exists(): 

1951 # If we are trusting then it is entirely possible for 

1952 # some components to be missing. In that case we skip 

1953 # to the next component. 

1954 if self.trustGetRequest: 

1955 continue 

1956 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1957 uris.componentURIs[file_info.component] = location.uri 

1958 

1959 return uris 

1960 

1961 def retrieveArtifacts( 

1962 self, 

1963 refs: Iterable[DatasetRef], 

1964 destination: ResourcePath, 

1965 transfer: str = "auto", 

1966 preserve_path: bool = True, 

1967 overwrite: bool = False, 

1968 ) -> list[ResourcePath]: 

1969 """Retrieve the file artifacts associated with the supplied refs. 

1970 

1971 Parameters 

1972 ---------- 

1973 refs : iterable of `DatasetRef` 

1974 The datasets for which file artifacts are to be retrieved. 

1975 A single ref can result in multiple files. The refs must 

1976 be resolved. 

1977 destination : `lsst.resources.ResourcePath` 

1978 Location to write the file artifacts. 

1979 transfer : `str`, optional 

1980 Method to use to transfer the artifacts. Must be one of the options 

1981 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1982 "move" is not allowed. 

1983 preserve_path : `bool`, optional 

1984 If `True` the full path of the file artifact within the datastore 

1985 is preserved. If `False` the final file component of the path 

1986 is used. 

1987 overwrite : `bool`, optional 

1988 If `True` allow transfers to overwrite existing files at the 

1989 destination. 

1990 

1991 Returns 

1992 ------- 

1993 targets : `list` of `lsst.resources.ResourcePath` 

1994 URIs of file artifacts in destination location. Order is not 

1995 preserved. 

1996 """ 

1997 if not destination.isdir(): 

1998 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1999 

2000 if transfer == "move": 

2001 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

2002 

2003 # Source -> Destination 

2004 # This also helps filter out duplicate DatasetRef in the request 

2005 # that will map to the same underlying file transfer. 

2006 to_transfer: dict[ResourcePath, ResourcePath] = {} 

2007 

2008 for ref in refs: 

2009 locations = self._get_dataset_locations_info(ref) 

2010 for location, _ in locations: 

2011 source_uri = location.uri 

2012 target_path: ResourcePathExpression 

2013 if preserve_path: 

2014 target_path = location.pathInStore 

2015 if target_path.isabs(): 

2016 # This is an absolute path to an external file. 

2017 # Use the full path. 

2018 target_path = target_path.relativeToPathRoot 

2019 else: 

2020 target_path = source_uri.basename() 

2021 target_uri = destination.join(target_path) 

2022 to_transfer[source_uri] = target_uri 

2023 

2024 # In theory can now parallelize the transfer 

2025 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2026 for source_uri, target_uri in to_transfer.items(): 

2027 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2028 

2029 return list(to_transfer.values()) 

2030 

2031 def get( 

2032 self, 

2033 ref: DatasetRef, 

2034 parameters: Mapping[str, Any] | None = None, 

2035 storageClass: StorageClass | str | None = None, 

2036 ) -> Any: 

2037 """Load an InMemoryDataset from the store. 

2038 

2039 Parameters 

2040 ---------- 

2041 ref : `DatasetRef` 

2042 Reference to the required Dataset. 

2043 parameters : `dict` 

2044 `StorageClass`-specific parameters that specify, for example, 

2045 a slice of the dataset to be loaded. 

2046 storageClass : `StorageClass` or `str`, optional 

2047 The storage class to be used to override the Python type 

2048 returned by this method. By default the returned type matches 

2049 the dataset type definition for this dataset. Specifying a 

2050 read `StorageClass` can force a different type to be returned. 

2051 This type must be compatible with the original type. 

2052 

2053 Returns 

2054 ------- 

2055 inMemoryDataset : `object` 

2056 Requested dataset or slice thereof as an InMemoryDataset. 

2057 

2058 Raises 

2059 ------ 

2060 FileNotFoundError 

2061 Requested dataset can not be retrieved. 

2062 TypeError 

2063 Return value from formatter has unexpected type. 

2064 ValueError 

2065 Formatter failed to process the dataset. 

2066 """ 

2067 # Supplied storage class for the component being read is either 

2068 # from the ref itself or some an override if we want to force 

2069 # type conversion. 

2070 if storageClass is not None: 

2071 ref = ref.overrideStorageClass(storageClass) 

2072 refStorageClass = ref.datasetType.storageClass 

2073 

2074 allGetInfo = self._prepare_for_get(ref, parameters) 

2075 refComponent = ref.datasetType.component() 

2076 

2077 # Create mapping from component name to related info 

2078 allComponents = {i.component: i for i in allGetInfo} 

2079 

2080 # By definition the dataset is disassembled if we have more 

2081 # than one record for it. 

2082 isDisassembled = len(allGetInfo) > 1 

2083 

2084 # Look for the special case where we are disassembled but the 

2085 # component is a derived component that was not written during 

2086 # disassembly. For this scenario we need to check that the 

2087 # component requested is listed as a derived component for the 

2088 # composite storage class 

2089 isDisassembledReadOnlyComponent = False 

2090 if isDisassembled and refComponent: 

2091 # The composite storage class should be accessible through 

2092 # the component dataset type 

2093 compositeStorageClass = ref.datasetType.parentStorageClass 

2094 

2095 # In the unlikely scenario where the composite storage 

2096 # class is not known, we can only assume that this is a 

2097 # normal component. If that assumption is wrong then the 

2098 # branch below that reads a persisted component will fail 

2099 # so there is no need to complain here. 

2100 if compositeStorageClass is not None: 

2101 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2102 

2103 if isDisassembled and not refComponent: 

2104 # This was a disassembled dataset spread over multiple files 

2105 # and we need to put them all back together again. 

2106 # Read into memory and then assemble 

2107 

2108 # Check that the supplied parameters are suitable for the type read 

2109 refStorageClass.validateParameters(parameters) 

2110 

2111 # We want to keep track of all the parameters that were not used 

2112 # by formatters. We assume that if any of the component formatters 

2113 # use a parameter that we do not need to apply it again in the 

2114 # assembler. 

2115 usedParams = set() 

2116 

2117 components: dict[str, Any] = {} 

2118 for getInfo in allGetInfo: 

2119 # assemblerParams are parameters not understood by the 

2120 # associated formatter. 

2121 usedParams.update(set(getInfo.formatterParams)) 

2122 

2123 component = getInfo.component 

2124 

2125 if component is None: 

2126 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2127 

2128 # We do not want the formatter to think it's reading 

2129 # a component though because it is really reading a 

2130 # standalone dataset -- always tell reader it is not a 

2131 # component. 

2132 components[component] = self._read_artifact_into_memory( 

2133 getInfo, ref.makeComponentRef(component), isComponent=False 

2134 ) 

2135 

2136 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2137 

2138 # Any unused parameters will have to be passed to the assembler 

2139 if parameters: 

2140 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2141 else: 

2142 unusedParams = {} 

2143 

2144 # Process parameters 

2145 return ref.datasetType.storageClass.delegate().handleParameters( 

2146 inMemoryDataset, parameters=unusedParams 

2147 ) 

2148 

2149 elif isDisassembledReadOnlyComponent: 

2150 compositeStorageClass = ref.datasetType.parentStorageClass 

2151 if compositeStorageClass is None: 

2152 raise RuntimeError( 

2153 f"Unable to retrieve derived component '{refComponent}' since" 

2154 "no composite storage class is available." 

2155 ) 

2156 

2157 if refComponent is None: 

2158 # Mainly for mypy 

2159 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2160 

2161 # Assume that every derived component can be calculated by 

2162 # forwarding the request to a single read/write component. 

2163 # Rather than guessing which rw component is the right one by 

2164 # scanning each for a derived component of the same name, 

2165 # we ask the storage class delegate directly which one is best to 

2166 # use. 

2167 compositeDelegate = compositeStorageClass.delegate() 

2168 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2169 refComponent, set(allComponents) 

2170 ) 

2171 

2172 # Select the relevant component 

2173 rwInfo = allComponents[forwardedComponent] 

2174 

2175 # For now assume that read parameters are validated against 

2176 # the real component and not the requested component 

2177 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2178 forwardedStorageClass.validateParameters(parameters) 

2179 

2180 # The reference to use for the caching must refer to the forwarded 

2181 # component and not the derived component. 

2182 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2183 

2184 # Unfortunately the FileDescriptor inside the formatter will have 

2185 # the wrong write storage class so we need to create a new one 

2186 # given the immutability constraint. 

2187 writeStorageClass = rwInfo.info.storageClass 

2188 

2189 # We may need to put some thought into parameters for read 

2190 # components but for now forward them on as is 

2191 readFormatter = type(rwInfo.formatter)( 

2192 FileDescriptor( 

2193 rwInfo.location, 

2194 readStorageClass=refStorageClass, 

2195 storageClass=writeStorageClass, 

2196 parameters=parameters, 

2197 ), 

2198 ref.dataId, 

2199 ) 

2200 

2201 # The assembler can not receive any parameter requests for a 

2202 # derived component at this time since the assembler will 

2203 # see the storage class of the derived component and those 

2204 # parameters will have to be handled by the formatter on the 

2205 # forwarded storage class. 

2206 assemblerParams: dict[str, Any] = {} 

2207 

2208 # Need to created a new info that specifies the derived 

2209 # component and associated storage class 

2210 readInfo = DatastoreFileGetInformation( 

2211 rwInfo.location, 

2212 readFormatter, 

2213 rwInfo.info, 

2214 assemblerParams, 

2215 {}, 

2216 refComponent, 

2217 refStorageClass, 

2218 ) 

2219 

2220 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2221 

2222 else: 

2223 # Single file request or component from that composite file 

2224 for lookup in (refComponent, None): 

2225 if lookup in allComponents: 

2226 getInfo = allComponents[lookup] 

2227 break 

2228 else: 

2229 raise FileNotFoundError( 

2230 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2231 ) 

2232 

2233 # Do not need the component itself if already disassembled 

2234 if isDisassembled: 

2235 isComponent = False 

2236 else: 

2237 isComponent = getInfo.component is not None 

2238 

2239 # For a component read of a composite we want the cache to 

2240 # be looking at the composite ref itself. 

2241 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2242 

2243 # For a disassembled component we can validate parametersagainst 

2244 # the component storage class directly 

2245 if isDisassembled: 

2246 refStorageClass.validateParameters(parameters) 

2247 else: 

2248 # For an assembled composite this could be a derived 

2249 # component derived from a real component. The validity 

2250 # of the parameters is not clear. For now validate against 

2251 # the composite storage class 

2252 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2253 

2254 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2255 

2256 @transactional 

2257 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2258 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2259 

2260 Parameters 

2261 ---------- 

2262 inMemoryDataset : `object` 

2263 The dataset to store. 

2264 ref : `DatasetRef` 

2265 Reference to the associated Dataset. 

2266 

2267 Raises 

2268 ------ 

2269 TypeError 

2270 Supplied object and storage class are inconsistent. 

2271 DatasetTypeNotSupportedError 

2272 The associated `DatasetType` is not handled by this datastore. 

2273 

2274 Notes 

2275 ----- 

2276 If the datastore is configured to reject certain dataset types it 

2277 is possible that the put will fail and raise a 

2278 `DatasetTypeNotSupportedError`. The main use case for this is to 

2279 allow `ChainedDatastore` to put to multiple datastores without 

2280 requiring that every datastore accepts the dataset. 

2281 """ 

2282 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2283 # doDisassembly = True 

2284 

2285 artifacts = [] 

2286 if doDisassembly: 

2287 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2288 if components is None: 

2289 raise RuntimeError( 

2290 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2291 f"with storage class {ref.datasetType.storageClass.name} " 

2292 "is configured to be disassembled, but cannot be." 

2293 ) 

2294 for component, componentInfo in components.items(): 

2295 # Don't recurse because we want to take advantage of 

2296 # bulk insert -- need a new DatasetRef that refers to the 

2297 # same dataset_id but has the component DatasetType 

2298 # DatasetType does not refer to the types of components 

2299 # So we construct one ourselves. 

2300 compRef = ref.makeComponentRef(component) 

2301 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2302 artifacts.append((compRef, storedInfo)) 

2303 else: 

2304 # Write the entire thing out 

2305 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2306 artifacts.append((ref, storedInfo)) 

2307 

2308 self._register_datasets(artifacts) 

2309 

2310 @transactional 

2311 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2312 # At this point can safely remove these datasets from the cache 

2313 # to avoid confusion later on. If they are not trashed later 

2314 # the cache will simply be refilled. 

2315 self.cacheManager.remove_from_cache(ref) 

2316 

2317 # If we are in trust mode there will be nothing to move to 

2318 # the trash table and we will have to try to delete the file 

2319 # immediately. 

2320 if self.trustGetRequest: 

2321 # Try to keep the logic below for a single file trash. 

2322 if isinstance(ref, DatasetRef): 

2323 refs = {ref} 

2324 else: 

2325 # Will recreate ref at the end of this branch. 

2326 refs = set(ref) 

2327 

2328 # Determine which datasets are known to datastore directly. 

2329 id_to_ref = {ref.id: ref for ref in refs} 

2330 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2331 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2332 

2333 missing = refs - existing_refs 

2334 if missing: 

2335 # Do an explicit existence check on these refs. 

2336 # We only care about the artifacts at this point and not 

2337 # the dataset existence. 

2338 artifact_existence: dict[ResourcePath, bool] = {} 

2339 _ = self.mexists(missing, artifact_existence) 

2340 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2341 

2342 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2343 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2344 for uri in uris: 

2345 try: 

2346 uri.remove() 

2347 except Exception as e: 

2348 if ignore_errors: 

2349 log.debug("Artifact %s could not be removed: %s", uri, e) 

2350 continue 

2351 raise 

2352 

2353 # There is no point asking the code below to remove refs we 

2354 # know are missing so update it with the list of existing 

2355 # records. Try to retain one vs many logic. 

2356 if not existing_refs: 

2357 # Nothing more to do since none of the datasets were 

2358 # known to the datastore record table. 

2359 return 

2360 ref = list(existing_refs) 

2361 if len(ref) == 1: 

2362 ref = ref[0] 

2363 

2364 # Get file metadata and internal metadata 

2365 if not isinstance(ref, DatasetRef): 

2366 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2367 # Assumed to be an iterable of refs so bulk mode enabled. 

2368 try: 

2369 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2370 except Exception as e: 

2371 if ignore_errors: 

2372 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2373 else: 

2374 raise 

2375 return 

2376 

2377 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2378 

2379 fileLocations = self._get_dataset_locations_info(ref) 

2380 

2381 if not fileLocations: 

2382 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2383 if ignore_errors: 

2384 log.warning(err_msg) 

2385 return 

2386 else: 

2387 raise FileNotFoundError(err_msg) 

2388 

2389 for location, _ in fileLocations: 

2390 if not self._artifact_exists(location): 

2391 err_msg = ( 

2392 f"Dataset is known to datastore {self.name} but " 

2393 f"associated artifact ({location.uri}) is missing" 

2394 ) 

2395 if ignore_errors: 

2396 log.warning(err_msg) 

2397 return 

2398 else: 

2399 raise FileNotFoundError(err_msg) 

2400 

2401 # Mark dataset as trashed 

2402 try: 

2403 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2404 except Exception as e: 

2405 if ignore_errors: 

2406 log.warning( 

2407 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2408 "but encountered an error: %s", 

2409 ref, 

2410 self.name, 

2411 e, 

2412 ) 

2413 pass 

2414 else: 

2415 raise 

2416 

2417 @transactional 

2418 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2419 """Remove all datasets from the trash. 

2420 

2421 Parameters 

2422 ---------- 

2423 ignore_errors : `bool` 

2424 If `True` return without error even if something went wrong. 

2425 Problems could occur if another process is simultaneously trying 

2426 to delete. 

2427 """ 

2428 log.debug("Emptying trash in datastore %s", self.name) 

2429 

2430 # Context manager will empty trash iff we finish it without raising. 

2431 # It will also automatically delete the relevant rows from the 

2432 # trash table and the records table. 

2433 with self.bridge.emptyTrash( 

2434 self._table, record_class=StoredFileInfo, record_column="path" 

2435 ) as trash_data: 

2436 # Removing the artifacts themselves requires that the files are 

2437 # not also associated with refs that are not to be trashed. 

2438 # Therefore need to do a query with the file paths themselves 

2439 # and return all the refs associated with them. Can only delete 

2440 # a file if the refs to be trashed are the only refs associated 

2441 # with the file. 

2442 # This requires multiple copies of the trashed items 

2443 trashed, artifacts_to_keep = trash_data 

2444 

2445 if artifacts_to_keep is None: 

2446 # The bridge is not helping us so have to work it out 

2447 # ourselves. This is not going to be as efficient. 

2448 trashed = list(trashed) 

2449 

2450 # The instance check is for mypy since up to this point it 

2451 # does not know the type of info. 

2452 path_map = self._refs_associated_with_artifacts( 

2453 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2454 ) 

2455 

2456 for ref, info in trashed: 

2457 # Mypy needs to know this is not the base class 

2458 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2459 

2460 path_map[info.path].remove(ref.id) 

2461 if not path_map[info.path]: 

2462 del path_map[info.path] 

2463 

2464 artifacts_to_keep = set(path_map) 

2465 

2466 for ref, info in trashed: 

2467 # Should not happen for this implementation but need 

2468 # to keep mypy happy. 

2469 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2470 

2471 # Mypy needs to know this is not the base class 

2472 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2473 

2474 if info.path in artifacts_to_keep: 

2475 # This is a multi-dataset artifact and we are not 

2476 # removing all associated refs. 

2477 continue 

2478 

2479 # Only trashed refs still known to datastore will be returned. 

2480 location = info.file_location(self.locationFactory) 

2481 

2482 # Point of no return for this artifact 

2483 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2484 try: 

2485 self._delete_artifact(location) 

2486 except FileNotFoundError: 

2487 # If the file itself has been deleted there is nothing 

2488 # we can do about it. It is possible that trash has 

2489 # been run in parallel in another process or someone 

2490 # decided to delete the file. It is unlikely to come 

2491 # back and so we should still continue with the removal 

2492 # of the entry from the trash table. It is also possible 

2493 # we removed it in a previous iteration if it was 

2494 # a multi-dataset artifact. The delete artifact method 

2495 # will log a debug message in this scenario. 

2496 # Distinguishing file missing before trash started and 

2497 # file already removed previously as part of this trash 

2498 # is not worth the distinction with regards to potential 

2499 # memory cost. 

2500 pass 

2501 except Exception as e: 

2502 if ignore_errors: 

2503 # Use a debug message here even though it's not 

2504 # a good situation. In some cases this can be 

2505 # caused by a race between user A and user B 

2506 # and neither of them has permissions for the 

2507 # other's files. Butler does not know about users 

2508 # and trash has no idea what collections these 

2509 # files were in (without guessing from a path). 

2510 log.debug( 

2511 "Encountered error removing artifact %s from datastore %s: %s", 

2512 location.uri, 

2513 self.name, 

2514 e, 

2515 ) 

2516 else: 

2517 raise 

2518 

2519 @transactional 

2520 def transfer_from( 

2521 self, 

2522 source_datastore: Datastore, 

2523 refs: Iterable[DatasetRef], 

2524 transfer: str = "auto", 

2525 artifact_existence: dict[ResourcePath, bool] | None = None, 

2526 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2527 # Docstring inherited 

2528 if type(self) is not type(source_datastore): 

2529 raise TypeError( 

2530 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2531 f"source datastore ({type(source_datastore)})." 

2532 ) 

2533 

2534 # Be explicit for mypy 

2535 if not isinstance(source_datastore, FileDatastore): 

2536 raise TypeError( 

2537 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2538 f" {type(source_datastore)}" 

2539 ) 

2540 

2541 # Stop early if "direct" transfer mode is requested. That would 

2542 # require that the URI inside the source datastore should be stored 

2543 # directly in the target datastore, which seems unlikely to be useful 

2544 # since at any moment the source datastore could delete the file. 

2545 if transfer in ("direct", "split"): 

2546 raise ValueError( 

2547 f"Can not transfer from a source datastore using {transfer} mode since" 

2548 " those files are controlled by the other datastore." 

2549 ) 

2550 

2551 # Empty existence lookup if none given. 

2552 if artifact_existence is None: 

2553 artifact_existence = {} 

2554 

2555 # We will go through the list multiple times so must convert 

2556 # generators to lists. 

2557 refs = list(refs) 

2558 

2559 # In order to handle disassembled composites the code works 

2560 # at the records level since it can assume that internal APIs 

2561 # can be used. 

2562 # - If the record already exists in the destination this is assumed 

2563 # to be okay. 

2564 # - If there is no record but the source and destination URIs are 

2565 # identical no transfer is done but the record is added. 

2566 # - If the source record refers to an absolute URI currently assume 

2567 # that that URI should remain absolute and will be visible to the 

2568 # destination butler. May need to have a flag to indicate whether 

2569 # the dataset should be transferred. This will only happen if 

2570 # the detached Butler has had a local ingest. 

2571 

2572 # What we really want is all the records in the source datastore 

2573 # associated with these refs. Or derived ones if they don't exist 

2574 # in the source. 

2575 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2576 

2577 # The source dataset_ids are the keys in these records 

2578 source_ids = set(source_records) 

2579 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2580 

2581 requested_ids = {ref.id for ref in refs} 

2582 missing_ids = requested_ids - source_ids 

2583 

2584 # Missing IDs can be okay if that datastore has allowed 

2585 # gets based on file existence. Should we transfer what we can 

2586 # or complain about it and warn? 

2587 if missing_ids and not source_datastore.trustGetRequest: 

2588 raise ValueError( 

2589 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2590 ) 

2591 

2592 # Need to map these missing IDs to a DatasetRef so we can guess 

2593 # the details. 

2594 if missing_ids: 

2595 log.info( 

2596 "Number of expected datasets missing from source datastore records: %d out of %d", 

2597 len(missing_ids), 

2598 len(requested_ids), 

2599 ) 

2600 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2601 

2602 # This should be chunked in case we end up having to check 

2603 # the file store since we need some log output to show 

2604 # progress. 

2605 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2606 records = {} 

2607 for missing in missing_ids_chunk: 

2608 # Ask the source datastore where the missing artifacts 

2609 # should be. An execution butler might not know about the 

2610 # artifacts even if they are there. 

2611 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2612 records[missing] = [info for _, info in expected] 

2613 

2614 # Call the mexist helper method in case we have not already 

2615 # checked these artifacts such that artifact_existence is 

2616 # empty. This allows us to benefit from parallelism. 

2617 # datastore.mexists() itself does not give us access to the 

2618 # derived datastore record. 

2619 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2620 ref_exists = source_datastore._process_mexists_records( 

2621 id_to_ref, records, False, artifact_existence=artifact_existence 

2622 ) 

2623 

2624 # Now go through the records and propagate the ones that exist. 

2625 location_factory = source_datastore.locationFactory 

2626 for missing, record_list in records.items(): 

2627 # Skip completely if the ref does not exist. 

2628 ref = id_to_ref[missing] 

2629 if not ref_exists[ref]: 

2630 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2631 continue 

2632 # Check for file artifact to decide which parts of a 

2633 # disassembled composite do exist. If there is only a 

2634 # single record we don't even need to look because it can't 

2635 # be a composite and must exist. 

2636 if len(record_list) == 1: 

2637 dataset_records = record_list 

2638 else: 

2639 dataset_records = [ 

2640 record 

2641 for record in record_list 

2642 if artifact_existence[record.file_location(location_factory).uri] 

2643 ] 

2644 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2645 

2646 # Rely on source_records being a defaultdict. 

2647 source_records[missing].extend(dataset_records) 

2648 

2649 # See if we already have these records 

2650 target_records = self._get_stored_records_associated_with_refs(refs) 

2651 

2652 # The artifacts to register 

2653 artifacts = [] 

2654 

2655 # Refs that already exist 

2656 already_present = [] 

2657 

2658 # Refs that were rejected by this datastore. 

2659 rejected = set() 

2660 

2661 # Refs that were transferred successfully. 

2662 accepted = set() 

2663 

2664 # Record each time we have done a "direct" transfer. 

2665 direct_transfers = [] 

2666 

2667 # Now can transfer the artifacts 

2668 for ref in refs: 

2669 if not self.constraints.isAcceptable(ref): 

2670 # This datastore should not be accepting this dataset. 

2671 rejected.add(ref) 

2672 continue 

2673 

2674 accepted.add(ref) 

2675 

2676 if ref.id in target_records: 

2677 # Already have an artifact for this. 

2678 already_present.append(ref) 

2679 continue 

2680 

2681 # mypy needs to know these are always resolved refs 

2682 for info in source_records[ref.id]: 

2683 source_location = info.file_location(source_datastore.locationFactory) 

2684 target_location = info.file_location(self.locationFactory) 

2685 if source_location == target_location and not source_location.pathInStore.isabs(): 

2686 # Artifact is already in the target location. 

2687 # (which is how execution butler currently runs) 

2688 pass 

2689 else: 

2690 if target_location.pathInStore.isabs(): 

2691 # Just because we can see the artifact when running 

2692 # the transfer doesn't mean it will be generally 

2693 # accessible to a user of this butler. Need to decide 

2694 # what to do about an absolute path. 

2695 if transfer == "auto": 

2696 # For "auto" transfers we allow the absolute URI 

2697 # to be recorded in the target datastore. 

2698 direct_transfers.append(source_location) 

2699 else: 

2700 # The user is explicitly requesting a transfer 

2701 # even for an absolute URI. This requires us to 

2702 # calculate the target path. 

2703 template_ref = ref 

2704 if info.component: 

2705 template_ref = ref.makeComponentRef(info.component) 

2706 target_location = self._calculate_ingested_datastore_name( 

2707 source_location.uri, 

2708 template_ref, 

2709 ) 

2710 

2711 info = info.update(path=target_location.pathInStore.path) 

2712 

2713 # Need to transfer it to the new location. 

2714 # Assume we should always overwrite. If the artifact 

2715 # is there this might indicate that a previous transfer 

2716 # was interrupted but was not able to be rolled back 

2717 # completely (eg pre-emption) so follow Datastore default 

2718 # and overwrite. 

2719 target_location.uri.transfer_from( 

2720 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2721 ) 

2722 

2723 artifacts.append((ref, info)) 

2724 

2725 if direct_transfers: 

2726 log.info( 

2727 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2728 len(direct_transfers), 

2729 "" if len(direct_transfers) == 1 else "s", 

2730 ) 

2731 

2732 self._register_datasets(artifacts) 

2733 

2734 if already_present: 

2735 n_skipped = len(already_present) 

2736 log.info( 

2737 "Skipped transfer of %d dataset%s already present in datastore", 

2738 n_skipped, 

2739 "" if n_skipped == 1 else "s", 

2740 ) 

2741 

2742 return accepted, rejected 

2743 

2744 @transactional 

2745 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2746 # Docstring inherited. 

2747 refs = list(refs) 

2748 self.bridge.forget(refs) 

2749 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2750 

2751 def validateConfiguration( 

2752 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2753 ) -> None: 

2754 """Validate some of the configuration for this datastore. 

2755 

2756 Parameters 

2757 ---------- 

2758 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2759 Entities to test against this configuration. Can be differing 

2760 types. 

2761 logFailures : `bool`, optional 

2762 If `True`, output a log message for every validation error 

2763 detected. 

2764 

2765 Raises 

2766 ------ 

2767 DatastoreValidationError 

2768 Raised if there is a validation problem with a configuration. 

2769 All the problems are reported in a single exception. 

2770 

2771 Notes 

2772 ----- 

2773 This method checks that all the supplied entities have valid file 

2774 templates and also have formatters defined. 

2775 """ 

2776 templateFailed = None 

2777 try: 

2778 self.templates.validateTemplates(entities, logFailures=logFailures) 

2779 except FileTemplateValidationError as e: 

2780 templateFailed = str(e) 

2781 

2782 formatterFailed = [] 

2783 for entity in entities: 

2784 try: 

2785 self.formatterFactory.getFormatterClass(entity) 

2786 except KeyError as e: 

2787 formatterFailed.append(str(e)) 

2788 if logFailures: 

2789 log.critical("Formatter failure: %s", e) 

2790 

2791 if templateFailed or formatterFailed: 

2792 messages = [] 

2793 if templateFailed: 

2794 messages.append(templateFailed) 

2795 if formatterFailed: 

2796 messages.append(",".join(formatterFailed)) 

2797 msg = ";\n".join(messages) 

2798 raise DatastoreValidationError(msg) 

2799 

2800 def getLookupKeys(self) -> set[LookupKey]: 

2801 # Docstring is inherited from base class 

2802 return ( 

2803 self.templates.getLookupKeys() 

2804 | self.formatterFactory.getLookupKeys() 

2805 | self.constraints.getLookupKeys() 

2806 ) 

2807 

2808 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2809 # Docstring is inherited from base class 

2810 # The key can be valid in either formatters or templates so we can 

2811 # only check the template if it exists 

2812 if lookupKey in self.templates: 

2813 try: 

2814 self.templates[lookupKey].validateTemplate(entity) 

2815 except FileTemplateValidationError as e: 

2816 raise DatastoreValidationError(e) from e 

2817 

2818 def export( 

2819 self, 

2820 refs: Iterable[DatasetRef], 

2821 *, 

2822 directory: ResourcePathExpression | None = None, 

2823 transfer: str | None = "auto", 

2824 ) -> Iterable[FileDataset]: 

2825 # Docstring inherited from Datastore.export. 

2826 if transfer == "auto" and directory is None: 

2827 transfer = None 

2828 

2829 if transfer is not None and directory is None: 

2830 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2831 

2832 if transfer == "move": 

2833 raise TypeError("Can not export by moving files out of datastore.") 

2834 elif transfer == "direct": 

2835 # For an export, treat this as equivalent to None. We do not 

2836 # want an import to risk using absolute URIs to datasets owned 

2837 # by another datastore. 

2838 log.info("Treating 'direct' transfer mode as in-place export.") 

2839 transfer = None 

2840 

2841 # Force the directory to be a URI object 

2842 directoryUri: ResourcePath | None = None 

2843 if directory is not None: 

2844 directoryUri = ResourcePath(directory, forceDirectory=True) 

2845 

2846 if transfer is not None and directoryUri is not None and not directoryUri.exists(): 

2847 # mypy needs the second test 

2848 raise FileNotFoundError(f"Export location {directory} does not exist") 

2849 

2850 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2851 for ref in progress.wrap(refs, "Exporting dataset files"): 

2852 fileLocations = self._get_dataset_locations_info(ref) 

2853 if not fileLocations: 

2854 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2855 # For now we can not export disassembled datasets 

2856 if len(fileLocations) > 1: 

2857 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2858 location, storedFileInfo = fileLocations[0] 

2859 

2860 pathInStore = location.pathInStore.path 

2861 if transfer is None: 

2862 # TODO: do we also need to return the readStorageClass somehow? 

2863 # We will use the path in store directly. If this is an 

2864 # absolute URI, preserve it. 

2865 if location.pathInStore.isabs(): 

2866 pathInStore = str(location.uri) 

2867 elif transfer == "direct": 

2868 # Use full URIs to the remote store in the export 

2869 pathInStore = str(location.uri) 

2870 else: 

2871 # mypy needs help 

2872 assert directoryUri is not None, "directoryUri must be defined to get here" 

2873 storeUri = ResourcePath(location.uri) 

2874 

2875 # if the datastore has an absolute URI to a resource, we 

2876 # have two options: 

2877 # 1. Keep the absolute URI in the exported YAML 

2878 # 2. Allocate a new name in the local datastore and transfer 

2879 # it. 

2880 # For now go with option 2 

2881 if location.pathInStore.isabs(): 

2882 template = self.templates.getTemplate(ref) 

2883 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2884 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2885 

2886 exportUri = directoryUri.join(pathInStore) 

2887 exportUri.transfer_from(storeUri, transfer=transfer) 

2888 

2889 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2890 

2891 @staticmethod 

2892 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2893 """Compute the checksum of the supplied file. 

2894 

2895 Parameters 

2896 ---------- 

2897 uri : `lsst.resources.ResourcePath` 

2898 Name of resource to calculate checksum from. 

2899 algorithm : `str`, optional 

2900 Name of algorithm to use. Must be one of the algorithms supported 

2901 by :py:class`hashlib`. 

2902 block_size : `int` 

2903 Number of bytes to read from file at one time. 

2904 

2905 Returns 

2906 ------- 

2907 hexdigest : `str` 

2908 Hex digest of the file. 

2909 

2910 Notes 

2911 ----- 

2912 Currently returns None if the URI is for a remote resource. 

2913 """ 

2914 if algorithm not in hashlib.algorithms_guaranteed: 

2915 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2916 

2917 if not uri.isLocal: 

2918 return None 

2919 

2920 hasher = hashlib.new(algorithm) 

2921 

2922 with uri.as_local() as local_uri, open(local_uri.ospath, "rb") as f: 

2923 for chunk in iter(lambda: f.read(block_size), b""): 

2924 hasher.update(chunk) 

2925 

2926 return hasher.hexdigest() 

2927 

2928 def needs_expanded_data_ids( 

2929 self, 

2930 transfer: str | None, 

2931 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2932 ) -> bool: 

2933 # Docstring inherited. 

2934 # This _could_ also use entity to inspect whether the filename template 

2935 # involves placeholders other than the required dimensions for its 

2936 # dataset type, but that's not necessary for correctness; it just 

2937 # enables more optimizations (perhaps only in theory). 

2938 return transfer not in ("direct", None) 

2939 

2940 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2941 # Docstring inherited from the base class. 

2942 record_data = data.get(self.name) 

2943 if not record_data: 

2944 return 

2945 

2946 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records) 

2947 

2948 # TODO: Verify that there are no unexpected table names in the dict? 

2949 unpacked_records = [] 

2950 for dataset_data in record_data.records.values(): 

2951 records = dataset_data.get(self._table.name) 

2952 if records: 

2953 for info in records: 

2954 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2955 unpacked_records.append(info.to_record()) 

2956 if unpacked_records: 

2957 self._table.insert(*unpacked_records, transaction=self._transaction) 

2958 

2959 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2960 # Docstring inherited from the base class. 

2961 exported_refs = list(self._bridge.check(refs)) 

2962 ids = {ref.id for ref in exported_refs} 

2963 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2964 for row in self._table.fetch(dataset_id=ids): 

2965 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2966 dataset_records = records.setdefault(info.dataset_id, {}) 

2967 dataset_records.setdefault(self._table.name, []).append(info) 

2968 

2969 record_data = DatastoreRecordData(records=records) 

2970 return {self.name: record_data} 

2971 

2972 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2973 # Docstring inherited from the base class. 

2974 self._retrieve_dataset_method = method 

2975 

2976 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2977 """Update dataset reference to use the storage class from registry.""" 

2978 if self._retrieve_dataset_method is None: 

2979 # We could raise an exception here but unit tests do not define 

2980 # this method. 

2981 return ref 

2982 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2983 if dataset_type is not None: 

2984 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2985 return ref