Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 8%

974 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-28 10:10 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from collections.abc import Callable, Iterable, Mapping, Sequence 

31from dataclasses import dataclass 

32from typing import TYPE_CHECKING, Any, ClassVar 

33 

34from lsst.daf.butler import ( 

35 CompositesMap, 

36 Config, 

37 DatasetId, 

38 DatasetRef, 

39 DatasetRefURIs, 

40 DatasetType, 

41 DatasetTypeNotSupportedError, 

42 Datastore, 

43 DatastoreCacheManager, 

44 DatastoreConfig, 

45 DatastoreDisabledCacheManager, 

46 DatastoreRecordData, 

47 DatastoreValidationError, 

48 FileDataset, 

49 FileDescriptor, 

50 FileTemplates, 

51 FileTemplateValidationError, 

52 Formatter, 

53 FormatterFactory, 

54 Location, 

55 LocationFactory, 

56 Progress, 

57 StorageClass, 

58 StoredDatastoreItemInfo, 

59 StoredFileInfo, 

60 ddl, 

61) 

62from lsst.daf.butler.core.repoRelocation import replaceRoot 

63from lsst.daf.butler.core.utils import transactional 

64from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

65from lsst.resources import ResourcePath, ResourcePathExpression 

66from lsst.utils.introspection import get_class_of, get_instance_of 

67from lsst.utils.iteration import chunk_iterable 

68 

69# For VERBOSE logging usage. 

70from lsst.utils.logging import VERBOSE, getLogger 

71from lsst.utils.timer import time_this 

72from sqlalchemy import BigInteger, String 

73 

74from ..registry.interfaces import FakeDatasetRef 

75from .genericDatastore import GenericBaseDatastore 

76 

77if TYPE_CHECKING: 

78 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

79 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

80 

81log = getLogger(__name__) 

82 

83 

84class _IngestPrepData(Datastore.IngestPrepData): 

85 """Helper class for FileDatastore ingest implementation. 

86 

87 Parameters 

88 ---------- 

89 datasets : `~collections.abc.Iterable` of `FileDataset` 

90 Files to be ingested by this datastore. 

91 """ 

92 

93 def __init__(self, datasets: Iterable[FileDataset]): 

94 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

95 self.datasets = datasets 

96 

97 

98@dataclass(frozen=True) 

99class DatastoreFileGetInformation: 

100 """Collection of useful parameters needed to retrieve a file from 

101 a Datastore. 

102 """ 

103 

104 location: Location 

105 """The location from which to read the dataset.""" 

106 

107 formatter: Formatter 

108 """The `Formatter` to use to deserialize the dataset.""" 

109 

110 info: StoredFileInfo 

111 """Stored information about this file and its formatter.""" 

112 

113 assemblerParams: Mapping[str, Any] 

114 """Parameters to use for post-processing the retrieved dataset.""" 

115 

116 formatterParams: Mapping[str, Any] 

117 """Parameters that were understood by the associated formatter.""" 

118 

119 component: str | None 

120 """The component to be retrieved (can be `None`).""" 

121 

122 readStorageClass: StorageClass 

123 """The `StorageClass` of the dataset being read.""" 

124 

125 

126class FileDatastore(GenericBaseDatastore): 

127 """Generic Datastore for file-based implementations. 

128 

129 Should always be sub-classed since key abstract methods are missing. 

130 

131 Parameters 

132 ---------- 

133 config : `DatastoreConfig` or `str` 

134 Configuration as either a `Config` object or URI to file. 

135 bridgeManager : `DatastoreRegistryBridgeManager` 

136 Object that manages the interface between `Registry` and datastores. 

137 butlerRoot : `str`, optional 

138 New datastore root to use to override the configuration value. 

139 

140 Raises 

141 ------ 

142 ValueError 

143 If root location does not exist and ``create`` is `False` in the 

144 configuration. 

145 """ 

146 

147 defaultConfigFile: ClassVar[str | None] = None 

148 """Path to configuration defaults. Accessed within the ``config`` resource 

149 or relative to a search path. Can be None if no defaults specified. 

150 """ 

151 

152 root: ResourcePath 

153 """Root directory URI of this `Datastore`.""" 

154 

155 locationFactory: LocationFactory 

156 """Factory for creating locations relative to the datastore root.""" 

157 

158 formatterFactory: FormatterFactory 

159 """Factory for creating instances of formatters.""" 

160 

161 templates: FileTemplates 

162 """File templates that can be used by this `Datastore`.""" 

163 

164 composites: CompositesMap 

165 """Determines whether a dataset should be disassembled on put.""" 

166 

167 defaultConfigFile = "datastores/fileDatastore.yaml" 

168 """Path to configuration defaults. Accessed within the ``config`` resource 

169 or relative to a search path. Can be None if no defaults specified. 

170 """ 

171 

172 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

173 """Callable that is used in trusted mode to retrieve registry definition 

174 of a named dataset type. 

175 """ 

176 

177 @classmethod 

178 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

179 """Set any filesystem-dependent config options for this Datastore to 

180 be appropriate for a new empty repository with the given root. 

181 

182 Parameters 

183 ---------- 

184 root : `str` 

185 URI to the root of the data repository. 

186 config : `Config` 

187 A `Config` to update. Only the subset understood by 

188 this component will be updated. Will not expand 

189 defaults. 

190 full : `Config` 

191 A complete config with all defaults expanded that can be 

192 converted to a `DatastoreConfig`. Read-only and will not be 

193 modified by this method. 

194 Repository-specific options that should not be obtained 

195 from defaults when Butler instances are constructed 

196 should be copied from ``full`` to ``config``. 

197 overwrite : `bool`, optional 

198 If `False`, do not modify a value in ``config`` if the value 

199 already exists. Default is always to overwrite with the provided 

200 ``root``. 

201 

202 Notes 

203 ----- 

204 If a keyword is explicitly defined in the supplied ``config`` it 

205 will not be overridden by this method if ``overwrite`` is `False`. 

206 This allows explicit values set in external configs to be retained. 

207 """ 

208 Config.updateParameters( 

209 DatastoreConfig, 

210 config, 

211 full, 

212 toUpdate={"root": root}, 

213 toCopy=("cls", ("records", "table")), 

214 overwrite=overwrite, 

215 ) 

216 

217 @classmethod 

218 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

219 return ddl.TableSpec( 

220 fields=[ 

221 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

222 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

223 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

224 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

225 # Use empty string to indicate no component 

226 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

227 # TODO: should checksum be Base64Bytes instead? 

228 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

229 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

230 ], 

231 unique=frozenset(), 

232 indexes=[ddl.IndexSpec("path")], 

233 ) 

234 

235 def __init__( 

236 self, 

237 config: DatastoreConfig | ResourcePathExpression, 

238 bridgeManager: DatastoreRegistryBridgeManager, 

239 butlerRoot: str | None = None, 

240 ): 

241 super().__init__(config, bridgeManager) 

242 if "root" not in self.config: 

243 raise ValueError("No root directory specified in configuration") 

244 

245 self._bridgeManager = bridgeManager 

246 

247 # Name ourselves either using an explicit name or a name 

248 # derived from the (unexpanded) root 

249 if "name" in self.config: 

250 self.name = self.config["name"] 

251 else: 

252 # We use the unexpanded root in the name to indicate that this 

253 # datastore can be moved without having to update registry. 

254 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

255 

256 # Support repository relocation in config 

257 # Existence of self.root is checked in subclass 

258 self.root = ResourcePath( 

259 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

260 ) 

261 

262 self.locationFactory = LocationFactory(self.root) 

263 self.formatterFactory = FormatterFactory() 

264 

265 # Now associate formatters with storage classes 

266 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

267 

268 # Read the file naming templates 

269 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

270 

271 # See if composites should be disassembled 

272 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

273 

274 tableName = self.config["records", "table"] 

275 try: 

276 # Storage of paths and formatters, keyed by dataset_id 

277 self._table = bridgeManager.opaque.register( 

278 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

279 ) 

280 # Interface to Registry. 

281 self._bridge = bridgeManager.register(self.name) 

282 except ReadOnlyDatabaseError: 

283 # If the database is read only and we just tried and failed to 

284 # create a table, it means someone is trying to create a read-only 

285 # butler client for an empty repo. That should be okay, as long 

286 # as they then try to get any datasets before some other client 

287 # creates the table. Chances are they'rejust validating 

288 # configuration. 

289 pass 

290 

291 # Determine whether checksums should be used - default to False 

292 self.useChecksum = self.config.get("checksum", False) 

293 

294 # Determine whether we can fall back to configuration if a 

295 # requested dataset is not known to registry 

296 self.trustGetRequest = self.config.get("trust_get_request", False) 

297 

298 # Create a cache manager 

299 self.cacheManager: AbstractDatastoreCacheManager 

300 if "cached" in self.config: 

301 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

302 else: 

303 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

304 

305 # Check existence and create directory structure if necessary 

306 if not self.root.exists(): 

307 if "create" not in self.config or not self.config["create"]: 

308 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

309 try: 

310 self.root.mkdir() 

311 except Exception as e: 

312 raise ValueError( 

313 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

314 ) from e 

315 

316 def __str__(self) -> str: 

317 return str(self.root) 

318 

319 @property 

320 def bridge(self) -> DatastoreRegistryBridge: 

321 return self._bridge 

322 

323 def _artifact_exists(self, location: Location) -> bool: 

324 """Check that an artifact exists in this datastore at the specified 

325 location. 

326 

327 Parameters 

328 ---------- 

329 location : `Location` 

330 Expected location of the artifact associated with this datastore. 

331 

332 Returns 

333 ------- 

334 exists : `bool` 

335 True if the location can be found, false otherwise. 

336 """ 

337 log.debug("Checking if resource exists: %s", location.uri) 

338 return location.uri.exists() 

339 

340 def _delete_artifact(self, location: Location) -> None: 

341 """Delete the artifact from the datastore. 

342 

343 Parameters 

344 ---------- 

345 location : `Location` 

346 Location of the artifact associated with this datastore. 

347 """ 

348 if location.pathInStore.isabs(): 

349 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

350 

351 try: 

352 location.uri.remove() 

353 except FileNotFoundError: 

354 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

355 raise 

356 except Exception as e: 

357 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

358 raise 

359 log.debug("Successfully deleted file: %s", location.uri) 

360 

361 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

362 # Docstring inherited from GenericBaseDatastore 

363 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

364 self._table.insert(*records, transaction=self._transaction) 

365 

366 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

367 # Docstring inherited from GenericBaseDatastore 

368 

369 # Look for the dataset_id -- there might be multiple matches 

370 # if we have disassembled the dataset. 

371 records = self._table.fetch(dataset_id=ref.id) 

372 return [StoredFileInfo.from_record(record) for record in records] 

373 

374 def _get_stored_records_associated_with_refs( 

375 self, refs: Iterable[DatasetIdRef] 

376 ) -> dict[DatasetId, list[StoredFileInfo]]: 

377 """Retrieve all records associated with the provided refs. 

378 

379 Parameters 

380 ---------- 

381 refs : iterable of `DatasetIdRef` 

382 The refs for which records are to be retrieved. 

383 

384 Returns 

385 ------- 

386 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

387 The matching records indexed by the ref ID. The number of entries 

388 in the dict can be smaller than the number of requested refs. 

389 """ 

390 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

391 

392 # Uniqueness is dataset_id + component so can have multiple records 

393 # per ref. 

394 records_by_ref = defaultdict(list) 

395 for record in records: 

396 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

397 return records_by_ref 

398 

399 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

400 """Return paths and associated dataset refs. 

401 

402 Parameters 

403 ---------- 

404 paths : `list` of `str` or `lsst.resources.ResourcePath` 

405 All the paths to include in search. 

406 

407 Returns 

408 ------- 

409 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

410 Mapping of each path to a set of associated database IDs. 

411 """ 

412 records = self._table.fetch(path=[str(path) for path in paths]) 

413 result = defaultdict(set) 

414 for row in records: 

415 result[row["path"]].add(row["dataset_id"]) 

416 return result 

417 

418 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

419 """Return all dataset refs associated with the supplied path. 

420 

421 Parameters 

422 ---------- 

423 pathInStore : `lsst.resources.ResourcePath` 

424 Path of interest in the data store. 

425 

426 Returns 

427 ------- 

428 ids : `set` of `int` 

429 All `DatasetRef` IDs associated with this path. 

430 """ 

431 records = list(self._table.fetch(path=str(pathInStore))) 

432 ids = {r["dataset_id"] for r in records} 

433 return ids 

434 

435 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

436 # Docstring inherited from GenericBaseDatastore 

437 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

438 

439 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

440 r"""Find all the `Location`\ s of the requested dataset in the 

441 `Datastore` and the associated stored file information. 

442 

443 Parameters 

444 ---------- 

445 ref : `DatasetRef` 

446 Reference to the required `Dataset`. 

447 

448 Returns 

449 ------- 

450 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

451 Location of the dataset within the datastore and 

452 stored information about each file and its formatter. 

453 """ 

454 # Get the file information (this will fail if no file) 

455 records = self.getStoredItemsInfo(ref) 

456 

457 # Use the path to determine the location -- we need to take 

458 # into account absolute URIs in the datastore record 

459 return [(r.file_location(self.locationFactory), r) for r in records] 

460 

461 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

462 """Check that there is only one dataset associated with the 

463 specified artifact. 

464 

465 Parameters 

466 ---------- 

467 ref : `DatasetRef` or `FakeDatasetRef` 

468 Dataset to be removed. 

469 location : `Location` 

470 The location of the artifact to be removed. 

471 

472 Returns 

473 ------- 

474 can_remove : `Bool` 

475 True if the artifact can be safely removed. 

476 """ 

477 # Can't ever delete absolute URIs. 

478 if location.pathInStore.isabs(): 

479 return False 

480 

481 # Get all entries associated with this path 

482 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

483 if not allRefs: 

484 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

485 

486 # Remove these refs from all the refs and if there is nothing left 

487 # then we can delete 

488 remainingRefs = allRefs - {ref.id} 

489 

490 if remainingRefs: 

491 return False 

492 return True 

493 

494 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

495 """Predict the location and related file information of the requested 

496 dataset in this datastore. 

497 

498 Parameters 

499 ---------- 

500 ref : `DatasetRef` 

501 Reference to the required `Dataset`. 

502 

503 Returns 

504 ------- 

505 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

506 Expected Location of the dataset within the datastore and 

507 placeholder information about each file and its formatter. 

508 

509 Notes 

510 ----- 

511 Uses the current configuration to determine how we would expect the 

512 datastore files to have been written if we couldn't ask registry. 

513 This is safe so long as there has been no change to datastore 

514 configuration between writing the dataset and wanting to read it. 

515 Will not work for files that have been ingested without using the 

516 standard file template or default formatter. 

517 """ 

518 # If we have a component ref we always need to ask the questions 

519 # of the composite. If the composite is disassembled this routine 

520 # should return all components. If the composite was not 

521 # disassembled the composite is what is stored regardless of 

522 # component request. Note that if the caller has disassembled 

523 # a composite there is no way for this guess to know that 

524 # without trying both the composite and component ref and seeing 

525 # if there is something at the component Location even without 

526 # disassembly being enabled. 

527 if ref.datasetType.isComponent(): 

528 ref = ref.makeCompositeRef() 

529 

530 # See if the ref is a composite that should be disassembled 

531 doDisassembly = self.composites.shouldBeDisassembled(ref) 

532 

533 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

534 

535 if doDisassembly: 

536 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

537 compRef = ref.makeComponentRef(component) 

538 location, formatter = self._determine_put_formatter_location(compRef) 

539 all_info.append((location, formatter, componentStorage, component)) 

540 

541 else: 

542 # Always use the composite ref if no disassembly 

543 location, formatter = self._determine_put_formatter_location(ref) 

544 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

545 

546 # Convert the list of tuples to have StoredFileInfo as second element 

547 return [ 

548 ( 

549 location, 

550 StoredFileInfo( 

551 formatter=formatter, 

552 path=location.pathInStore.path, 

553 storageClass=storageClass, 

554 component=component, 

555 checksum=None, 

556 file_size=-1, 

557 dataset_id=ref.id, 

558 ), 

559 ) 

560 for location, formatter, storageClass, component in all_info 

561 ] 

562 

563 def _prepare_for_get( 

564 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

565 ) -> list[DatastoreFileGetInformation]: 

566 """Check parameters for ``get`` and obtain formatter and 

567 location. 

568 

569 Parameters 

570 ---------- 

571 ref : `DatasetRef` 

572 Reference to the required Dataset. 

573 parameters : `dict` 

574 `StorageClass`-specific parameters that specify, for example, 

575 a slice of the dataset to be loaded. 

576 

577 Returns 

578 ------- 

579 getInfo : `list` [`DatastoreFileGetInformation`] 

580 Parameters needed to retrieve each file. 

581 """ 

582 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

583 

584 # The storage class we want to use eventually 

585 refStorageClass = ref.datasetType.storageClass 

586 

587 # For trusted mode need to reset storage class. 

588 ref = self._cast_storage_class(ref) 

589 

590 # Get file metadata and internal metadata 

591 fileLocations = self._get_dataset_locations_info(ref) 

592 if not fileLocations: 

593 if not self.trustGetRequest: 

594 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

595 # Assume the dataset is where we think it should be 

596 fileLocations = self._get_expected_dataset_locations_info(ref) 

597 

598 if len(fileLocations) > 1: 

599 disassembled = True 

600 

601 # If trust is involved it is possible that there will be 

602 # components listed here that do not exist in the datastore. 

603 # Explicitly check for file artifact existence and filter out any 

604 # that are missing. 

605 if self.trustGetRequest: 

606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

607 

608 # For now complain only if we have no components at all. One 

609 # component is probably a problem but we can punt that to the 

610 # assembler. 

611 if not fileLocations: 

612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

613 

614 else: 

615 disassembled = False 

616 

617 # Is this a component request? 

618 refComponent = ref.datasetType.component() 

619 

620 fileGetInfo = [] 

621 for location, storedFileInfo in fileLocations: 

622 # The storage class used to write the file 

623 writeStorageClass = storedFileInfo.storageClass 

624 

625 # If this has been disassembled we need read to match the write 

626 if disassembled: 

627 readStorageClass = writeStorageClass 

628 else: 

629 readStorageClass = refStorageClass 

630 

631 formatter = get_instance_of( 

632 storedFileInfo.formatter, 

633 FileDescriptor( 

634 location, 

635 readStorageClass=readStorageClass, 

636 storageClass=writeStorageClass, 

637 parameters=parameters, 

638 ), 

639 ref.dataId, 

640 ) 

641 

642 formatterParams, notFormatterParams = formatter.segregateParameters() 

643 

644 # Of the remaining parameters, extract the ones supported by 

645 # this StorageClass (for components not all will be handled) 

646 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

647 

648 # The ref itself could be a component if the dataset was 

649 # disassembled by butler, or we disassembled in datastore and 

650 # components came from the datastore records 

651 component = storedFileInfo.component if storedFileInfo.component else refComponent 

652 

653 fileGetInfo.append( 

654 DatastoreFileGetInformation( 

655 location, 

656 formatter, 

657 storedFileInfo, 

658 assemblerParams, 

659 formatterParams, 

660 component, 

661 readStorageClass, 

662 ) 

663 ) 

664 

665 return fileGetInfo 

666 

667 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

668 """Check the arguments for ``put`` and obtain formatter and 

669 location. 

670 

671 Parameters 

672 ---------- 

673 inMemoryDataset : `object` 

674 The dataset to store. 

675 ref : `DatasetRef` 

676 Reference to the associated Dataset. 

677 

678 Returns 

679 ------- 

680 location : `Location` 

681 The location to write the dataset. 

682 formatter : `Formatter` 

683 The `Formatter` to use to write the dataset. 

684 

685 Raises 

686 ------ 

687 TypeError 

688 Supplied object and storage class are inconsistent. 

689 DatasetTypeNotSupportedError 

690 The associated `DatasetType` is not handled by this datastore. 

691 """ 

692 self._validate_put_parameters(inMemoryDataset, ref) 

693 return self._determine_put_formatter_location(ref) 

694 

695 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

696 """Calculate the formatter and output location to use for put. 

697 

698 Parameters 

699 ---------- 

700 ref : `DatasetRef` 

701 Reference to the associated Dataset. 

702 

703 Returns 

704 ------- 

705 location : `Location` 

706 The location to write the dataset. 

707 formatter : `Formatter` 

708 The `Formatter` to use to write the dataset. 

709 """ 

710 # Work out output file name 

711 try: 

712 template = self.templates.getTemplate(ref) 

713 except KeyError as e: 

714 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

715 

716 # Validate the template to protect against filenames from different 

717 # dataIds returning the same and causing overwrite confusion. 

718 template.validateTemplate(ref) 

719 

720 location = self.locationFactory.fromPath(template.format(ref)) 

721 

722 # Get the formatter based on the storage class 

723 storageClass = ref.datasetType.storageClass 

724 try: 

725 formatter = self.formatterFactory.getFormatter( 

726 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

727 ) 

728 except KeyError as e: 

729 raise DatasetTypeNotSupportedError( 

730 f"Unable to find formatter for {ref} in datastore {self.name}" 

731 ) from e 

732 

733 # Now that we know the formatter, update the location 

734 location = formatter.makeUpdatedLocation(location) 

735 

736 return location, formatter 

737 

738 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

739 # Docstring inherited from base class 

740 if transfer != "auto": 

741 return transfer 

742 

743 # See if the paths are within the datastore or not 

744 inside = [self._pathInStore(d.path) is not None for d in datasets] 

745 

746 if all(inside): 

747 transfer = None 

748 elif not any(inside): 

749 # Allow ResourcePath to use its own knowledge 

750 transfer = "auto" 

751 else: 

752 # This can happen when importing from a datastore that 

753 # has had some datasets ingested using "direct" mode. 

754 # Also allow ResourcePath to sort it out but warn about it. 

755 # This can happen if you are importing from a datastore 

756 # that had some direct transfer datasets. 

757 log.warning( 

758 "Some datasets are inside the datastore and some are outside. Using 'split' " 

759 "transfer mode. This assumes that the files outside the datastore are " 

760 "still accessible to the new butler since they will not be copied into " 

761 "the target datastore." 

762 ) 

763 transfer = "split" 

764 

765 return transfer 

766 

767 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

768 """Return path relative to datastore root. 

769 

770 Parameters 

771 ---------- 

772 path : `lsst.resources.ResourcePathExpression` 

773 Path to dataset. Can be absolute URI. If relative assumed to 

774 be relative to the datastore. Returns path in datastore 

775 or raises an exception if the path it outside. 

776 

777 Returns 

778 ------- 

779 inStore : `str` 

780 Path relative to datastore root. Returns `None` if the file is 

781 outside the root. 

782 """ 

783 # Relative path will always be relative to datastore 

784 pathUri = ResourcePath(path, forceAbsolute=False) 

785 return pathUri.relative_to(self.root) 

786 

787 def _standardizeIngestPath( 

788 self, path: str | ResourcePath, *, transfer: str | None = None 

789 ) -> str | ResourcePath: 

790 """Standardize the path of a to-be-ingested file. 

791 

792 Parameters 

793 ---------- 

794 path : `str` or `lsst.resources.ResourcePath` 

795 Path of a file to be ingested. This parameter is not expected 

796 to be all the types that can be used to construct a 

797 `~lsst.resources.ResourcePath`. 

798 transfer : `str`, optional 

799 How (and whether) the dataset should be added to the datastore. 

800 See `ingest` for details of transfer modes. 

801 This implementation is provided only so 

802 `NotImplementedError` can be raised if the mode is not supported; 

803 actual transfers are deferred to `_extractIngestInfo`. 

804 

805 Returns 

806 ------- 

807 path : `str` or `lsst.resources.ResourcePath` 

808 New path in what the datastore considers standard form. If an 

809 absolute URI was given that will be returned unchanged. 

810 

811 Notes 

812 ----- 

813 Subclasses of `FileDatastore` can implement this method instead 

814 of `_prepIngest`. It should not modify the data repository or given 

815 file in any way. 

816 

817 Raises 

818 ------ 

819 NotImplementedError 

820 Raised if the datastore does not support the given transfer mode 

821 (including the case where ingest is not supported at all). 

822 FileNotFoundError 

823 Raised if one of the given files does not exist. 

824 """ 

825 if transfer not in (None, "direct", "split") + self.root.transferModes: 

826 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

827 

828 # A relative URI indicates relative to datastore root 

829 srcUri = ResourcePath(path, forceAbsolute=False) 

830 if not srcUri.isabs(): 

831 srcUri = self.root.join(path) 

832 

833 if not srcUri.exists(): 

834 raise FileNotFoundError( 

835 f"Resource at {srcUri} does not exist; note that paths to ingest " 

836 f"are assumed to be relative to {self.root} unless they are absolute." 

837 ) 

838 

839 if transfer is None: 

840 relpath = srcUri.relative_to(self.root) 

841 if not relpath: 

842 raise RuntimeError( 

843 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

844 ) 

845 

846 # Return the relative path within the datastore for internal 

847 # transfer 

848 path = relpath 

849 

850 return path 

851 

852 def _extractIngestInfo( 

853 self, 

854 path: ResourcePathExpression, 

855 ref: DatasetRef, 

856 *, 

857 formatter: Formatter | type[Formatter], 

858 transfer: str | None = None, 

859 record_validation_info: bool = True, 

860 ) -> StoredFileInfo: 

861 """Relocate (if necessary) and extract `StoredFileInfo` from a 

862 to-be-ingested file. 

863 

864 Parameters 

865 ---------- 

866 path : `lsst.resources.ResourcePathExpression` 

867 URI or path of a file to be ingested. 

868 ref : `DatasetRef` 

869 Reference for the dataset being ingested. Guaranteed to have 

870 ``dataset_id not None`. 

871 formatter : `type` or `Formatter` 

872 `Formatter` subclass to use for this dataset or an instance. 

873 transfer : `str`, optional 

874 How (and whether) the dataset should be added to the datastore. 

875 See `ingest` for details of transfer modes. 

876 record_validation_info : `bool`, optional 

877 If `True`, the default, the datastore can record validation 

878 information associated with the file. If `False` the datastore 

879 will not attempt to track any information such as checksums 

880 or file sizes. This can be useful if such information is tracked 

881 in an external system or if the file is to be compressed in place. 

882 It is up to the datastore whether this parameter is relevant. 

883 

884 Returns 

885 ------- 

886 info : `StoredFileInfo` 

887 Internal datastore record for this file. This will be inserted by 

888 the caller; the `_extractIngestInfo` is only responsible for 

889 creating and populating the struct. 

890 

891 Raises 

892 ------ 

893 FileNotFoundError 

894 Raised if one of the given files does not exist. 

895 FileExistsError 

896 Raised if transfer is not `None` but the (internal) location the 

897 file would be moved to is already occupied. 

898 """ 

899 if self._transaction is None: 

900 raise RuntimeError("Ingest called without transaction enabled") 

901 

902 # Create URI of the source path, do not need to force a relative 

903 # path to absolute. 

904 srcUri = ResourcePath(path, forceAbsolute=False) 

905 

906 # Track whether we have read the size of the source yet 

907 have_sized = False 

908 

909 tgtLocation: Location | None 

910 if transfer is None or transfer == "split": 

911 # A relative path is assumed to be relative to the datastore 

912 # in this context 

913 if not srcUri.isabs(): 

914 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

915 else: 

916 # Work out the path in the datastore from an absolute URI 

917 # This is required to be within the datastore. 

918 pathInStore = srcUri.relative_to(self.root) 

919 if pathInStore is None and transfer is None: 

920 raise RuntimeError( 

921 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

922 ) 

923 if pathInStore: 

924 tgtLocation = self.locationFactory.fromPath(pathInStore) 

925 elif transfer == "split": 

926 # Outside the datastore but treat that as a direct ingest 

927 # instead. 

928 tgtLocation = None 

929 else: 

930 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

931 elif transfer == "direct": 

932 # Want to store the full URI to the resource directly in 

933 # datastore. This is useful for referring to permanent archive 

934 # storage for raw data. 

935 # Trust that people know what they are doing. 

936 tgtLocation = None 

937 else: 

938 # Work out the name we want this ingested file to have 

939 # inside the datastore 

940 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

941 if not tgtLocation.uri.dirname().exists(): 

942 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

943 tgtLocation.uri.dirname().mkdir() 

944 

945 # if we are transferring from a local file to a remote location 

946 # it may be more efficient to get the size and checksum of the 

947 # local file rather than the transferred one 

948 if record_validation_info and srcUri.isLocal: 

949 size = srcUri.size() 

950 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

951 have_sized = True 

952 

953 # Transfer the resource to the destination. 

954 # Allow overwrite of an existing file. This matches the behavior 

955 # of datastore.put() in that it trusts that registry would not 

956 # be asking to overwrite unless registry thought that the 

957 # overwrite was allowed. 

958 tgtLocation.uri.transfer_from( 

959 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

960 ) 

961 

962 if tgtLocation is None: 

963 # This means we are using direct mode 

964 targetUri = srcUri 

965 targetPath = str(srcUri) 

966 else: 

967 targetUri = tgtLocation.uri 

968 targetPath = tgtLocation.pathInStore.path 

969 

970 # the file should exist in the datastore now 

971 if record_validation_info: 

972 if not have_sized: 

973 size = targetUri.size() 

974 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

975 else: 

976 # Not recording any file information. 

977 size = -1 

978 checksum = None 

979 

980 return StoredFileInfo( 

981 formatter=formatter, 

982 path=targetPath, 

983 storageClass=ref.datasetType.storageClass, 

984 component=ref.datasetType.component(), 

985 file_size=size, 

986 checksum=checksum, 

987 dataset_id=ref.id, 

988 ) 

989 

990 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

991 # Docstring inherited from Datastore._prepIngest. 

992 filtered = [] 

993 for dataset in datasets: 

994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

995 if not acceptable: 

996 continue 

997 else: 

998 dataset.refs = acceptable 

999 if dataset.formatter is None: 

1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1001 else: 

1002 assert isinstance(dataset.formatter, (type, str)) 

1003 formatter_class = get_class_of(dataset.formatter) 

1004 if not issubclass(formatter_class, Formatter): 

1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1006 dataset.formatter = formatter_class 

1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1008 filtered.append(dataset) 

1009 return _IngestPrepData(filtered) 

1010 

1011 @transactional 

1012 def _finishIngest( 

1013 self, 

1014 prepData: Datastore.IngestPrepData, 

1015 *, 

1016 transfer: str | None = None, 

1017 record_validation_info: bool = True, 

1018 ) -> None: 

1019 # Docstring inherited from Datastore._finishIngest. 

1020 refsAndInfos = [] 

1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1023 # Do ingest as if the first dataset ref is associated with the file 

1024 info = self._extractIngestInfo( 

1025 dataset.path, 

1026 dataset.refs[0], 

1027 formatter=dataset.formatter, 

1028 transfer=transfer, 

1029 record_validation_info=record_validation_info, 

1030 ) 

1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1032 self._register_datasets(refsAndInfos) 

1033 

1034 def _calculate_ingested_datastore_name( 

1035 self, 

1036 srcUri: ResourcePath, 

1037 ref: DatasetRef, 

1038 formatter: Formatter | type[Formatter] | None = None, 

1039 ) -> Location: 

1040 """Given a source URI and a DatasetRef, determine the name the 

1041 dataset will have inside datastore. 

1042 

1043 Parameters 

1044 ---------- 

1045 srcUri : `lsst.resources.ResourcePath` 

1046 URI to the source dataset file. 

1047 ref : `DatasetRef` 

1048 Ref associated with the newly-ingested dataset artifact. This 

1049 is used to determine the name within the datastore. 

1050 formatter : `Formatter` or Formatter class. 

1051 Formatter to use for validation. Can be a class or an instance. 

1052 No validation of the file extension is performed if the 

1053 ``formatter`` is `None`. This can be used if the caller knows 

1054 that the source URI and target URI will use the same formatter. 

1055 

1056 Returns 

1057 ------- 

1058 location : `Location` 

1059 Target location for the newly-ingested dataset. 

1060 """ 

1061 # Ingesting a file from outside the datastore. 

1062 # This involves a new name. 

1063 template = self.templates.getTemplate(ref) 

1064 location = self.locationFactory.fromPath(template.format(ref)) 

1065 

1066 # Get the extension 

1067 ext = srcUri.getExtension() 

1068 

1069 # Update the destination to include that extension 

1070 location.updateExtension(ext) 

1071 

1072 # Ask the formatter to validate this extension 

1073 if formatter is not None: 

1074 formatter.validateExtension(location) 

1075 

1076 return location 

1077 

1078 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1079 """Write out in memory dataset to datastore. 

1080 

1081 Parameters 

1082 ---------- 

1083 inMemoryDataset : `object` 

1084 Dataset to write to datastore. 

1085 ref : `DatasetRef` 

1086 Registry information associated with this dataset. 

1087 

1088 Returns 

1089 ------- 

1090 info : `StoredFileInfo` 

1091 Information describing the artifact written to the datastore. 

1092 """ 

1093 # May need to coerce the in memory dataset to the correct 

1094 # python type, but first we need to make sure the storage class 

1095 # reflects the one defined in the data repository. 

1096 ref = self._cast_storage_class(ref) 

1097 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1098 

1099 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1100 uri = location.uri 

1101 

1102 if not uri.dirname().exists(): 

1103 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1104 uri.dirname().mkdir() 

1105 

1106 if self._transaction is None: 

1107 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1108 

1109 def _removeFileExists(uri: ResourcePath) -> None: 

1110 """Remove a file and do not complain if it is not there. 

1111 

1112 This is important since a formatter might fail before the file 

1113 is written and we should not confuse people by writing spurious 

1114 error messages to the log. 

1115 """ 

1116 try: 

1117 uri.remove() 

1118 except FileNotFoundError: 

1119 pass 

1120 

1121 # Register a callback to try to delete the uploaded data if 

1122 # something fails below 

1123 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1124 

1125 data_written = False 

1126 if not uri.isLocal: 

1127 # This is a remote URI. Some datasets can be serialized directly 

1128 # to bytes and sent to the remote datastore without writing a 

1129 # file. If the dataset is intended to be saved to the cache 

1130 # a file is always written and direct write to the remote 

1131 # datastore is bypassed. 

1132 if not self.cacheManager.should_be_cached(ref): 

1133 try: 

1134 serializedDataset = formatter.toBytes(inMemoryDataset) 

1135 except NotImplementedError: 

1136 # Fallback to the file writing option. 

1137 pass 

1138 except Exception as e: 

1139 raise RuntimeError( 

1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1141 ) from e 

1142 else: 

1143 log.debug("Writing bytes directly to %s", uri) 

1144 uri.write(serializedDataset, overwrite=True) 

1145 log.debug("Successfully wrote bytes directly to %s", uri) 

1146 data_written = True 

1147 

1148 if not data_written: 

1149 # Did not write the bytes directly to object store so instead 

1150 # write to temporary file. Always write to a temporary even if 

1151 # using a local file system -- that gives us atomic writes. 

1152 # If a process is killed as the file is being written we do not 

1153 # want it to remain in the correct place but in corrupt state. 

1154 # For local files write to the output directory not temporary dir. 

1155 prefix = uri.dirname() if uri.isLocal else None 

1156 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1157 # Need to configure the formatter to write to a different 

1158 # location and that needs us to overwrite internals 

1159 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1160 with formatter._updateLocation(Location(None, temporary_uri)): 

1161 try: 

1162 formatter.write(inMemoryDataset) 

1163 except Exception as e: 

1164 raise RuntimeError( 

1165 f"Failed to serialize dataset {ref} of type" 

1166 f" {type(inMemoryDataset)} to " 

1167 f"temporary location {temporary_uri}" 

1168 ) from e 

1169 

1170 # Use move for a local file since that becomes an efficient 

1171 # os.rename. For remote resources we use copy to allow the 

1172 # file to be cached afterwards. 

1173 transfer = "move" if uri.isLocal else "copy" 

1174 

1175 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1176 

1177 if transfer == "copy": 

1178 # Cache if required 

1179 self.cacheManager.move_to_cache(temporary_uri, ref) 

1180 

1181 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1182 

1183 # URI is needed to resolve what ingest case are we dealing with 

1184 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1185 

1186 def _read_artifact_into_memory( 

1187 self, 

1188 getInfo: DatastoreFileGetInformation, 

1189 ref: DatasetRef, 

1190 isComponent: bool = False, 

1191 cache_ref: DatasetRef | None = None, 

1192 ) -> Any: 

1193 """Read the artifact from datastore into in memory object. 

1194 

1195 Parameters 

1196 ---------- 

1197 getInfo : `DatastoreFileGetInformation` 

1198 Information about the artifact within the datastore. 

1199 ref : `DatasetRef` 

1200 The registry information associated with this artifact. 

1201 isComponent : `bool` 

1202 Flag to indicate if a component is being read from this artifact. 

1203 cache_ref : `DatasetRef`, optional 

1204 The DatasetRef to use when looking up the file in the cache. 

1205 This ref must have the same ID as the supplied ref but can 

1206 be a parent ref or component ref to indicate to the cache whether 

1207 a composite file is being requested from the cache or a component 

1208 file. Without this the cache will default to the supplied ref but 

1209 it can get confused with read-only derived components for 

1210 disassembled composites. 

1211 

1212 Returns 

1213 ------- 

1214 inMemoryDataset : `object` 

1215 The artifact as a python object. 

1216 """ 

1217 location = getInfo.location 

1218 uri = location.uri 

1219 log.debug("Accessing data from %s", uri) 

1220 

1221 if cache_ref is None: 

1222 cache_ref = ref 

1223 if cache_ref.id != ref.id: 

1224 raise ValueError( 

1225 "The supplied cache dataset ref refers to a different dataset than expected:" 

1226 f" {ref.id} != {cache_ref.id}" 

1227 ) 

1228 

1229 # Cannot recalculate checksum but can compare size as a quick check 

1230 # Do not do this if the size is negative since that indicates 

1231 # we do not know. 

1232 recorded_size = getInfo.info.file_size 

1233 resource_size = uri.size() 

1234 if recorded_size >= 0 and resource_size != recorded_size: 

1235 raise RuntimeError( 

1236 "Integrity failure in Datastore. " 

1237 f"Size of file {uri} ({resource_size}) " 

1238 f"does not match size recorded in registry of {recorded_size}" 

1239 ) 

1240 

1241 # For the general case we have choices for how to proceed. 

1242 # 1. Always use a local file (downloading the remote resource to a 

1243 # temporary file if needed). 

1244 # 2. Use a threshold size and read into memory and use bytes. 

1245 # Use both for now with an arbitrary hand off size. 

1246 # This allows small datasets to be downloaded from remote object 

1247 # stores without requiring a temporary file. 

1248 

1249 formatter = getInfo.formatter 

1250 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1251 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1252 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1253 if cached_file is not None: 

1254 desired_uri = cached_file 

1255 msg = f" (cached version of {uri})" 

1256 else: 

1257 desired_uri = uri 

1258 msg = "" 

1259 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1260 serializedDataset = desired_uri.read() 

1261 log.debug( 

1262 "Deserializing %s from %d bytes from location %s with formatter %s", 

1263 f"component {getInfo.component}" if isComponent else "", 

1264 len(serializedDataset), 

1265 uri, 

1266 formatter.name(), 

1267 ) 

1268 try: 

1269 result = formatter.fromBytes( 

1270 serializedDataset, component=getInfo.component if isComponent else None 

1271 ) 

1272 except Exception as e: 

1273 raise ValueError( 

1274 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1275 f" ({ref.datasetType.name} from {uri}): {e}" 

1276 ) from e 

1277 else: 

1278 # Read from file. 

1279 

1280 # Have to update the Location associated with the formatter 

1281 # because formatter.read does not allow an override. 

1282 # This could be improved. 

1283 location_updated = False 

1284 msg = "" 

1285 

1286 # First check in cache for local version. 

1287 # The cache will only be relevant for remote resources but 

1288 # no harm in always asking. Context manager ensures that cache 

1289 # file is not deleted during cache expiration. 

1290 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1291 if cached_file is not None: 

1292 msg = f"(via cache read of remote file {uri})" 

1293 uri = cached_file 

1294 location_updated = True 

1295 

1296 with uri.as_local() as local_uri: 

1297 can_be_cached = False 

1298 if uri != local_uri: 

1299 # URI was remote and file was downloaded 

1300 cache_msg = "" 

1301 location_updated = True 

1302 

1303 if self.cacheManager.should_be_cached(cache_ref): 

1304 # In this scenario we want to ask if the downloaded 

1305 # file should be cached but we should not cache 

1306 # it until after we've used it (to ensure it can't 

1307 # be expired whilst we are using it). 

1308 can_be_cached = True 

1309 

1310 # Say that it is "likely" to be cached because 

1311 # if the formatter read fails we will not be 

1312 # caching this file. 

1313 cache_msg = " and likely cached" 

1314 

1315 msg = f"(via download to local file{cache_msg})" 

1316 

1317 # Calculate the (possibly) new location for the formatter 

1318 # to use. 

1319 newLocation = Location(*local_uri.split()) if location_updated else None 

1320 

1321 log.debug( 

1322 "Reading%s from location %s %s with formatter %s", 

1323 f" component {getInfo.component}" if isComponent else "", 

1324 uri, 

1325 msg, 

1326 formatter.name(), 

1327 ) 

1328 try: 

1329 with formatter._updateLocation(newLocation): 

1330 with time_this( 

1331 log, 

1332 msg="Reading%s from location %s %s with formatter %s", 

1333 args=( 

1334 f" component {getInfo.component}" if isComponent else "", 

1335 uri, 

1336 msg, 

1337 formatter.name(), 

1338 ), 

1339 ): 

1340 result = formatter.read(component=getInfo.component if isComponent else None) 

1341 except Exception as e: 

1342 raise ValueError( 

1343 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1344 f" ({ref.datasetType.name} from {uri}): {e}" 

1345 ) from e 

1346 

1347 # File was read successfully so can move to cache 

1348 if can_be_cached: 

1349 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1350 

1351 return self._post_process_get( 

1352 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1353 ) 

1354 

1355 def knows(self, ref: DatasetRef) -> bool: 

1356 """Check if the dataset is known to the datastore. 

1357 

1358 Does not check for existence of any artifact. 

1359 

1360 Parameters 

1361 ---------- 

1362 ref : `DatasetRef` 

1363 Reference to the required dataset. 

1364 

1365 Returns 

1366 ------- 

1367 exists : `bool` 

1368 `True` if the dataset is known to the datastore. 

1369 """ 

1370 fileLocations = self._get_dataset_locations_info(ref) 

1371 if fileLocations: 

1372 return True 

1373 return False 

1374 

1375 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1376 # Docstring inherited from the base class. 

1377 

1378 # The records themselves. Could be missing some entries. 

1379 records = self._get_stored_records_associated_with_refs(refs) 

1380 

1381 return {ref: ref.id in records for ref in refs} 

1382 

1383 def _process_mexists_records( 

1384 self, 

1385 id_to_ref: dict[DatasetId, DatasetRef], 

1386 records: dict[DatasetId, list[StoredFileInfo]], 

1387 all_required: bool, 

1388 artifact_existence: dict[ResourcePath, bool] | None = None, 

1389 ) -> dict[DatasetRef, bool]: 

1390 """Check given records for existence. 

1391 

1392 Helper function for `mexists()`. 

1393 

1394 Parameters 

1395 ---------- 

1396 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1397 Mapping of the dataset ID to the dataset ref itself. 

1398 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1399 Records as generally returned by 

1400 ``_get_stored_records_associated_with_refs``. 

1401 all_required : `bool` 

1402 Flag to indicate whether existence requires all artifacts 

1403 associated with a dataset ID to exist or not for existence. 

1404 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1405 Optional mapping of datastore artifact to existence. Updated by 

1406 this method with details of all artifacts tested. Can be `None` 

1407 if the caller is not interested. 

1408 

1409 Returns 

1410 ------- 

1411 existence : `dict` of [`DatasetRef`, `bool`] 

1412 Mapping from dataset to boolean indicating existence. 

1413 """ 

1414 # The URIs to be checked and a mapping of those URIs to 

1415 # the dataset ID. 

1416 uris_to_check: list[ResourcePath] = [] 

1417 location_map: dict[ResourcePath, DatasetId] = {} 

1418 

1419 location_factory = self.locationFactory 

1420 

1421 uri_existence: dict[ResourcePath, bool] = {} 

1422 for ref_id, infos in records.items(): 

1423 # Key is the dataset Id, value is list of StoredItemInfo 

1424 uris = [info.file_location(location_factory).uri for info in infos] 

1425 location_map.update({uri: ref_id for uri in uris}) 

1426 

1427 # Check the local cache directly for a dataset corresponding 

1428 # to the remote URI. 

1429 if self.cacheManager.file_count > 0: 

1430 ref = id_to_ref[ref_id] 

1431 for uri, storedFileInfo in zip(uris, infos): 

1432 check_ref = ref 

1433 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1434 check_ref = ref.makeComponentRef(component) 

1435 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1436 # Proxy for URI existence. 

1437 uri_existence[uri] = True 

1438 else: 

1439 uris_to_check.append(uri) 

1440 else: 

1441 # Check all of them. 

1442 uris_to_check.extend(uris) 

1443 

1444 if artifact_existence is not None: 

1445 # If a URI has already been checked remove it from the list 

1446 # and immediately add the status to the output dict. 

1447 filtered_uris_to_check = [] 

1448 for uri in uris_to_check: 

1449 if uri in artifact_existence: 

1450 uri_existence[uri] = artifact_existence[uri] 

1451 else: 

1452 filtered_uris_to_check.append(uri) 

1453 uris_to_check = filtered_uris_to_check 

1454 

1455 # Results. 

1456 dataset_existence: dict[DatasetRef, bool] = {} 

1457 

1458 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1459 for uri, exists in uri_existence.items(): 

1460 dataset_id = location_map[uri] 

1461 ref = id_to_ref[dataset_id] 

1462 

1463 # Disassembled composite needs to check all locations. 

1464 # all_required indicates whether all need to exist or not. 

1465 if ref in dataset_existence: 

1466 if all_required: 

1467 exists = dataset_existence[ref] and exists 

1468 else: 

1469 exists = dataset_existence[ref] or exists 

1470 dataset_existence[ref] = exists 

1471 

1472 if artifact_existence is not None: 

1473 artifact_existence.update(uri_existence) 

1474 

1475 return dataset_existence 

1476 

1477 def mexists( 

1478 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1479 ) -> dict[DatasetRef, bool]: 

1480 """Check the existence of multiple datasets at once. 

1481 

1482 Parameters 

1483 ---------- 

1484 refs : iterable of `DatasetRef` 

1485 The datasets to be checked. 

1486 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1487 Optional mapping of datastore artifact to existence. Updated by 

1488 this method with details of all artifacts tested. Can be `None` 

1489 if the caller is not interested. 

1490 

1491 Returns 

1492 ------- 

1493 existence : `dict` of [`DatasetRef`, `bool`] 

1494 Mapping from dataset to boolean indicating existence. 

1495 

1496 Notes 

1497 ----- 

1498 To minimize potentially costly remote existence checks, the local 

1499 cache is checked as a proxy for existence. If a file for this 

1500 `DatasetRef` does exist no check is done for the actual URI. This 

1501 could result in possibly unexpected behavior if the dataset itself 

1502 has been removed from the datastore by another process whilst it is 

1503 still in the cache. 

1504 """ 

1505 chunk_size = 10_000 

1506 dataset_existence: dict[DatasetRef, bool] = {} 

1507 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1508 n_found_total = 0 

1509 n_checked = 0 

1510 n_chunks = 0 

1511 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1512 chunk_result = self._mexists(chunk, artifact_existence) 

1513 

1514 # The log message level and content depend on how many 

1515 # datasets we are processing. 

1516 n_results = len(chunk_result) 

1517 

1518 # Use verbose logging to ensure that messages can be seen 

1519 # easily if many refs are being checked. 

1520 log_threshold = VERBOSE 

1521 n_checked += n_results 

1522 

1523 # This sum can take some time so only do it if we know the 

1524 # result is going to be used. 

1525 n_found = 0 

1526 if log.isEnabledFor(log_threshold): 

1527 # Can treat the booleans as 0, 1 integers and sum them. 

1528 n_found = sum(chunk_result.values()) 

1529 n_found_total += n_found 

1530 

1531 # We are deliberately not trying to count the number of refs 

1532 # provided in case it's in the millions. This means there is a 

1533 # situation where the number of refs exactly matches the chunk 

1534 # size and we will switch to the multi-chunk path even though 

1535 # we only have a single chunk. 

1536 if n_results < chunk_size and n_chunks == 0: 

1537 # Single chunk will be processed so we can provide more detail. 

1538 if n_results == 1: 

1539 ref = list(chunk_result)[0] 

1540 # Use debug logging to be consistent with `exists()`. 

1541 log.debug( 

1542 "Calling mexists() with single ref that does%s exist (%s).", 

1543 "" if chunk_result[ref] else " not", 

1544 ref, 

1545 ) 

1546 else: 

1547 # Single chunk but multiple files. Summarize. 

1548 log.log( 

1549 log_threshold, 

1550 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1551 n_found, 

1552 n_checked, 

1553 ) 

1554 

1555 else: 

1556 # Use incremental verbose logging when we have multiple chunks. 

1557 log.log( 

1558 log_threshold, 

1559 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1560 "(running total from all chunks so far: %d found out of %d checked)", 

1561 n_chunks, 

1562 n_found, 

1563 n_results, 

1564 n_found_total, 

1565 n_checked, 

1566 ) 

1567 dataset_existence.update(chunk_result) 

1568 n_chunks += 1 

1569 

1570 return dataset_existence 

1571 

1572 def _mexists( 

1573 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1574 ) -> dict[DatasetRef, bool]: 

1575 """Check the existence of multiple datasets at once. 

1576 

1577 Parameters 

1578 ---------- 

1579 refs : iterable of `DatasetRef` 

1580 The datasets to be checked. 

1581 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1582 Optional mapping of datastore artifact to existence. Updated by 

1583 this method with details of all artifacts tested. Can be `None` 

1584 if the caller is not interested. 

1585 

1586 Returns 

1587 ------- 

1588 existence : `dict` of [`DatasetRef`, `bool`] 

1589 Mapping from dataset to boolean indicating existence. 

1590 """ 

1591 # Make a mapping from refs with the internal storage class to the given 

1592 # refs that may have a different one. We'll use the internal refs 

1593 # throughout this method and convert back at the very end. 

1594 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1595 

1596 # Need a mapping of dataset_id to (internal) dataset ref since some 

1597 # internal APIs work with dataset_id. 

1598 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1599 

1600 # Set of all IDs we are checking for. 

1601 requested_ids = set(id_to_ref.keys()) 

1602 

1603 # The records themselves. Could be missing some entries. 

1604 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1605 

1606 dataset_existence = self._process_mexists_records( 

1607 id_to_ref, records, True, artifact_existence=artifact_existence 

1608 ) 

1609 

1610 # Set of IDs that have been handled. 

1611 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1612 

1613 missing_ids = requested_ids - handled_ids 

1614 if missing_ids: 

1615 dataset_existence.update( 

1616 self._mexists_check_expected( 

1617 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1618 ) 

1619 ) 

1620 

1621 return { 

1622 internal_ref_to_input_ref[internal_ref]: existence 

1623 for internal_ref, existence in dataset_existence.items() 

1624 } 

1625 

1626 def _mexists_check_expected( 

1627 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1628 ) -> dict[DatasetRef, bool]: 

1629 """Check existence of refs that are not known to datastore. 

1630 

1631 Parameters 

1632 ---------- 

1633 refs : iterable of `DatasetRef` 

1634 The datasets to be checked. These are assumed not to be known 

1635 to datastore. 

1636 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1637 Optional mapping of datastore artifact to existence. Updated by 

1638 this method with details of all artifacts tested. Can be `None` 

1639 if the caller is not interested. 

1640 

1641 Returns 

1642 ------- 

1643 existence : `dict` of [`DatasetRef`, `bool`] 

1644 Mapping from dataset to boolean indicating existence. 

1645 """ 

1646 dataset_existence: dict[DatasetRef, bool] = {} 

1647 if not self.trustGetRequest: 

1648 # Must assume these do not exist 

1649 for ref in refs: 

1650 dataset_existence[ref] = False 

1651 else: 

1652 log.debug( 

1653 "%d datasets were not known to datastore during initial existence check.", 

1654 len(refs), 

1655 ) 

1656 

1657 # Construct data structure identical to that returned 

1658 # by _get_stored_records_associated_with_refs() but using 

1659 # guessed names. 

1660 records = {} 

1661 id_to_ref = {} 

1662 for missing_ref in refs: 

1663 expected = self._get_expected_dataset_locations_info(missing_ref) 

1664 dataset_id = missing_ref.id 

1665 records[dataset_id] = [info for _, info in expected] 

1666 id_to_ref[dataset_id] = missing_ref 

1667 

1668 dataset_existence.update( 

1669 self._process_mexists_records( 

1670 id_to_ref, 

1671 records, 

1672 False, 

1673 artifact_existence=artifact_existence, 

1674 ) 

1675 ) 

1676 

1677 return dataset_existence 

1678 

1679 def exists(self, ref: DatasetRef) -> bool: 

1680 """Check if the dataset exists in the datastore. 

1681 

1682 Parameters 

1683 ---------- 

1684 ref : `DatasetRef` 

1685 Reference to the required dataset. 

1686 

1687 Returns 

1688 ------- 

1689 exists : `bool` 

1690 `True` if the entity exists in the `Datastore`. 

1691 

1692 Notes 

1693 ----- 

1694 The local cache is checked as a proxy for existence in the remote 

1695 object store. It is possible that another process on a different 

1696 compute node could remove the file from the object store even 

1697 though it is present in the local cache. 

1698 """ 

1699 ref = self._cast_storage_class(ref) 

1700 fileLocations = self._get_dataset_locations_info(ref) 

1701 

1702 # if we are being asked to trust that registry might not be correct 

1703 # we ask for the expected locations and check them explicitly 

1704 if not fileLocations: 

1705 if not self.trustGetRequest: 

1706 return False 

1707 

1708 # First check the cache. If it is not found we must check 

1709 # the datastore itself. Assume that any component in the cache 

1710 # means that the dataset does exist somewhere. 

1711 if self.cacheManager.known_to_cache(ref): 

1712 return True 

1713 

1714 # When we are guessing a dataset location we can not check 

1715 # for the existence of every component since we can not 

1716 # know if every component was written. Instead we check 

1717 # for the existence of any of the expected locations. 

1718 for location, _ in self._get_expected_dataset_locations_info(ref): 

1719 if self._artifact_exists(location): 

1720 return True 

1721 return False 

1722 

1723 # All listed artifacts must exist. 

1724 for location, storedFileInfo in fileLocations: 

1725 # Checking in cache needs the component ref. 

1726 check_ref = ref 

1727 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1728 check_ref = ref.makeComponentRef(component) 

1729 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1730 continue 

1731 

1732 if not self._artifact_exists(location): 

1733 return False 

1734 

1735 return True 

1736 

1737 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1738 """Return URIs associated with dataset. 

1739 

1740 Parameters 

1741 ---------- 

1742 ref : `DatasetRef` 

1743 Reference to the required dataset. 

1744 predict : `bool`, optional 

1745 If the datastore does not know about the dataset, should it 

1746 return a predicted URI or not? 

1747 

1748 Returns 

1749 ------- 

1750 uris : `DatasetRefURIs` 

1751 The URI to the primary artifact associated with this dataset (if 

1752 the dataset was disassembled within the datastore this may be 

1753 `None`), and the URIs to any components associated with the dataset 

1754 artifact. (can be empty if there are no components). 

1755 """ 

1756 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1757 return many[ref] 

1758 

1759 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1760 """URI to the Dataset. 

1761 

1762 Parameters 

1763 ---------- 

1764 ref : `DatasetRef` 

1765 Reference to the required Dataset. 

1766 predict : `bool` 

1767 If `True`, allow URIs to be returned of datasets that have not 

1768 been written. 

1769 

1770 Returns 

1771 ------- 

1772 uri : `str` 

1773 URI pointing to the dataset within the datastore. If the 

1774 dataset does not exist in the datastore, and if ``predict`` is 

1775 `True`, the URI will be a prediction and will include a URI 

1776 fragment "#predicted". 

1777 If the datastore does not have entities that relate well 

1778 to the concept of a URI the returned URI will be 

1779 descriptive. The returned URI is not guaranteed to be obtainable. 

1780 

1781 Raises 

1782 ------ 

1783 FileNotFoundError 

1784 Raised if a URI has been requested for a dataset that does not 

1785 exist and guessing is not allowed. 

1786 RuntimeError 

1787 Raised if a request is made for a single URI but multiple URIs 

1788 are associated with this dataset. 

1789 

1790 Notes 

1791 ----- 

1792 When a predicted URI is requested an attempt will be made to form 

1793 a reasonable URI based on file templates and the expected formatter. 

1794 """ 

1795 primary, components = self.getURIs(ref, predict) 

1796 if primary is None or components: 

1797 raise RuntimeError( 

1798 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1799 ) 

1800 return primary 

1801 

1802 def _predict_URIs( 

1803 self, 

1804 ref: DatasetRef, 

1805 ) -> DatasetRefURIs: 

1806 """Predict the URIs of a dataset ref. 

1807 

1808 Parameters 

1809 ---------- 

1810 ref : `DatasetRef` 

1811 Reference to the required Dataset. 

1812 

1813 Returns 

1814 ------- 

1815 URI : DatasetRefUris 

1816 Primary and component URIs. URIs will contain a URI fragment 

1817 "#predicted". 

1818 """ 

1819 uris = DatasetRefURIs() 

1820 

1821 if self.composites.shouldBeDisassembled(ref): 

1822 for component, _ in ref.datasetType.storageClass.components.items(): 

1823 comp_ref = ref.makeComponentRef(component) 

1824 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1825 

1826 # Add the "#predicted" URI fragment to indicate this is a 

1827 # guess 

1828 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1829 

1830 else: 

1831 location, _ = self._determine_put_formatter_location(ref) 

1832 

1833 # Add the "#predicted" URI fragment to indicate this is a guess 

1834 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1835 

1836 return uris 

1837 

1838 def getManyURIs( 

1839 self, 

1840 refs: Iterable[DatasetRef], 

1841 predict: bool = False, 

1842 allow_missing: bool = False, 

1843 ) -> dict[DatasetRef, DatasetRefURIs]: 

1844 # Docstring inherited 

1845 

1846 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1847 

1848 records = self._get_stored_records_associated_with_refs(refs) 

1849 records_keys = records.keys() 

1850 

1851 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1852 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1853 

1854 # Have to handle trustGetRequest mode by checking for the existence 

1855 # of the missing refs on disk. 

1856 if missing_refs: 

1857 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1858 really_missing = set() 

1859 not_missing = set() 

1860 for ref, exists in dataset_existence.items(): 

1861 if exists: 

1862 not_missing.add(ref) 

1863 else: 

1864 really_missing.add(ref) 

1865 

1866 if not_missing: 

1867 # Need to recalculate the missing/existing split. 

1868 existing_refs = existing_refs + tuple(not_missing) 

1869 missing_refs = tuple(really_missing) 

1870 

1871 for ref in missing_refs: 

1872 # if this has never been written then we have to guess 

1873 if not predict: 

1874 if not allow_missing: 

1875 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1876 else: 

1877 uris[ref] = self._predict_URIs(ref) 

1878 

1879 for ref in existing_refs: 

1880 file_infos = records[ref.id] 

1881 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1882 uris[ref] = self._locations_to_URI(ref, file_locations) 

1883 

1884 return uris 

1885 

1886 def _locations_to_URI( 

1887 self, 

1888 ref: DatasetRef, 

1889 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1890 ) -> DatasetRefURIs: 

1891 """Convert one or more file locations associated with a DatasetRef 

1892 to a DatasetRefURIs. 

1893 

1894 Parameters 

1895 ---------- 

1896 ref : `DatasetRef` 

1897 Reference to the dataset. 

1898 file_locations : Sequence[Tuple[Location, StoredFileInfo]] 

1899 Each item in the sequence is the location of the dataset within the 

1900 datastore and stored information about the file and its formatter. 

1901 If there is only one item in the sequence then it is treated as the 

1902 primary URI. If there is more than one item then they are treated 

1903 as component URIs. If there are no items then an error is raised 

1904 unless ``self.trustGetRequest`` is `True`. 

1905 

1906 Returns 

1907 ------- 

1908 uris: DatasetRefURIs 

1909 Represents the primary URI or component URIs described by the 

1910 inputs. 

1911 

1912 Raises 

1913 ------ 

1914 RuntimeError 

1915 If no file locations are passed in and ``self.trustGetRequest`` is 

1916 `False`. 

1917 FileNotFoundError 

1918 If the a passed-in URI does not exist, and ``self.trustGetRequest`` 

1919 is `False`. 

1920 RuntimeError 

1921 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1922 unexpected). 

1923 """ 

1924 guessing = False 

1925 uris = DatasetRefURIs() 

1926 

1927 if not file_locations: 

1928 if not self.trustGetRequest: 

1929 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1930 file_locations = self._get_expected_dataset_locations_info(ref) 

1931 guessing = True 

1932 

1933 if len(file_locations) == 1: 

1934 # No disassembly so this is the primary URI 

1935 uris.primaryURI = file_locations[0][0].uri 

1936 if guessing and not uris.primaryURI.exists(): 

1937 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1938 else: 

1939 for location, file_info in file_locations: 

1940 if file_info.component is None: 

1941 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1942 if guessing and not location.uri.exists(): 

1943 # If we are trusting then it is entirely possible for 

1944 # some components to be missing. In that case we skip 

1945 # to the next component. 

1946 if self.trustGetRequest: 

1947 continue 

1948 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1949 uris.componentURIs[file_info.component] = location.uri 

1950 

1951 return uris 

1952 

1953 def retrieveArtifacts( 

1954 self, 

1955 refs: Iterable[DatasetRef], 

1956 destination: ResourcePath, 

1957 transfer: str = "auto", 

1958 preserve_path: bool = True, 

1959 overwrite: bool = False, 

1960 ) -> list[ResourcePath]: 

1961 """Retrieve the file artifacts associated with the supplied refs. 

1962 

1963 Parameters 

1964 ---------- 

1965 refs : iterable of `DatasetRef` 

1966 The datasets for which file artifacts are to be retrieved. 

1967 A single ref can result in multiple files. The refs must 

1968 be resolved. 

1969 destination : `lsst.resources.ResourcePath` 

1970 Location to write the file artifacts. 

1971 transfer : `str`, optional 

1972 Method to use to transfer the artifacts. Must be one of the options 

1973 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1974 "move" is not allowed. 

1975 preserve_path : `bool`, optional 

1976 If `True` the full path of the file artifact within the datastore 

1977 is preserved. If `False` the final file component of the path 

1978 is used. 

1979 overwrite : `bool`, optional 

1980 If `True` allow transfers to overwrite existing files at the 

1981 destination. 

1982 

1983 Returns 

1984 ------- 

1985 targets : `list` of `lsst.resources.ResourcePath` 

1986 URIs of file artifacts in destination location. Order is not 

1987 preserved. 

1988 """ 

1989 if not destination.isdir(): 

1990 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1991 

1992 if transfer == "move": 

1993 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1994 

1995 # Source -> Destination 

1996 # This also helps filter out duplicate DatasetRef in the request 

1997 # that will map to the same underlying file transfer. 

1998 to_transfer: dict[ResourcePath, ResourcePath] = {} 

1999 

2000 for ref in refs: 

2001 locations = self._get_dataset_locations_info(ref) 

2002 for location, _ in locations: 

2003 source_uri = location.uri 

2004 target_path: ResourcePathExpression 

2005 if preserve_path: 

2006 target_path = location.pathInStore 

2007 if target_path.isabs(): 

2008 # This is an absolute path to an external file. 

2009 # Use the full path. 

2010 target_path = target_path.relativeToPathRoot 

2011 else: 

2012 target_path = source_uri.basename() 

2013 target_uri = destination.join(target_path) 

2014 to_transfer[source_uri] = target_uri 

2015 

2016 # In theory can now parallelize the transfer 

2017 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2018 for source_uri, target_uri in to_transfer.items(): 

2019 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2020 

2021 return list(to_transfer.values()) 

2022 

2023 def get( 

2024 self, 

2025 ref: DatasetRef, 

2026 parameters: Mapping[str, Any] | None = None, 

2027 storageClass: StorageClass | str | None = None, 

2028 ) -> Any: 

2029 """Load an InMemoryDataset from the store. 

2030 

2031 Parameters 

2032 ---------- 

2033 ref : `DatasetRef` 

2034 Reference to the required Dataset. 

2035 parameters : `dict` 

2036 `StorageClass`-specific parameters that specify, for example, 

2037 a slice of the dataset to be loaded. 

2038 storageClass : `StorageClass` or `str`, optional 

2039 The storage class to be used to override the Python type 

2040 returned by this method. By default the returned type matches 

2041 the dataset type definition for this dataset. Specifying a 

2042 read `StorageClass` can force a different type to be returned. 

2043 This type must be compatible with the original type. 

2044 

2045 Returns 

2046 ------- 

2047 inMemoryDataset : `object` 

2048 Requested dataset or slice thereof as an InMemoryDataset. 

2049 

2050 Raises 

2051 ------ 

2052 FileNotFoundError 

2053 Requested dataset can not be retrieved. 

2054 TypeError 

2055 Return value from formatter has unexpected type. 

2056 ValueError 

2057 Formatter failed to process the dataset. 

2058 """ 

2059 # Supplied storage class for the component being read is either 

2060 # from the ref itself or some an override if we want to force 

2061 # type conversion. 

2062 if storageClass is not None: 

2063 ref = ref.overrideStorageClass(storageClass) 

2064 refStorageClass = ref.datasetType.storageClass 

2065 

2066 allGetInfo = self._prepare_for_get(ref, parameters) 

2067 refComponent = ref.datasetType.component() 

2068 

2069 # Create mapping from component name to related info 

2070 allComponents = {i.component: i for i in allGetInfo} 

2071 

2072 # By definition the dataset is disassembled if we have more 

2073 # than one record for it. 

2074 isDisassembled = len(allGetInfo) > 1 

2075 

2076 # Look for the special case where we are disassembled but the 

2077 # component is a derived component that was not written during 

2078 # disassembly. For this scenario we need to check that the 

2079 # component requested is listed as a derived component for the 

2080 # composite storage class 

2081 isDisassembledReadOnlyComponent = False 

2082 if isDisassembled and refComponent: 

2083 # The composite storage class should be accessible through 

2084 # the component dataset type 

2085 compositeStorageClass = ref.datasetType.parentStorageClass 

2086 

2087 # In the unlikely scenario where the composite storage 

2088 # class is not known, we can only assume that this is a 

2089 # normal component. If that assumption is wrong then the 

2090 # branch below that reads a persisted component will fail 

2091 # so there is no need to complain here. 

2092 if compositeStorageClass is not None: 

2093 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2094 

2095 if isDisassembled and not refComponent: 

2096 # This was a disassembled dataset spread over multiple files 

2097 # and we need to put them all back together again. 

2098 # Read into memory and then assemble 

2099 

2100 # Check that the supplied parameters are suitable for the type read 

2101 refStorageClass.validateParameters(parameters) 

2102 

2103 # We want to keep track of all the parameters that were not used 

2104 # by formatters. We assume that if any of the component formatters 

2105 # use a parameter that we do not need to apply it again in the 

2106 # assembler. 

2107 usedParams = set() 

2108 

2109 components: dict[str, Any] = {} 

2110 for getInfo in allGetInfo: 

2111 # assemblerParams are parameters not understood by the 

2112 # associated formatter. 

2113 usedParams.update(set(getInfo.formatterParams)) 

2114 

2115 component = getInfo.component 

2116 

2117 if component is None: 

2118 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2119 

2120 # We do not want the formatter to think it's reading 

2121 # a component though because it is really reading a 

2122 # standalone dataset -- always tell reader it is not a 

2123 # component. 

2124 components[component] = self._read_artifact_into_memory( 

2125 getInfo, ref.makeComponentRef(component), isComponent=False 

2126 ) 

2127 

2128 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2129 

2130 # Any unused parameters will have to be passed to the assembler 

2131 if parameters: 

2132 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2133 else: 

2134 unusedParams = {} 

2135 

2136 # Process parameters 

2137 return ref.datasetType.storageClass.delegate().handleParameters( 

2138 inMemoryDataset, parameters=unusedParams 

2139 ) 

2140 

2141 elif isDisassembledReadOnlyComponent: 

2142 compositeStorageClass = ref.datasetType.parentStorageClass 

2143 if compositeStorageClass is None: 

2144 raise RuntimeError( 

2145 f"Unable to retrieve derived component '{refComponent}' since" 

2146 "no composite storage class is available." 

2147 ) 

2148 

2149 if refComponent is None: 

2150 # Mainly for mypy 

2151 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2152 

2153 # Assume that every derived component can be calculated by 

2154 # forwarding the request to a single read/write component. 

2155 # Rather than guessing which rw component is the right one by 

2156 # scanning each for a derived component of the same name, 

2157 # we ask the storage class delegate directly which one is best to 

2158 # use. 

2159 compositeDelegate = compositeStorageClass.delegate() 

2160 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2161 refComponent, set(allComponents) 

2162 ) 

2163 

2164 # Select the relevant component 

2165 rwInfo = allComponents[forwardedComponent] 

2166 

2167 # For now assume that read parameters are validated against 

2168 # the real component and not the requested component 

2169 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2170 forwardedStorageClass.validateParameters(parameters) 

2171 

2172 # The reference to use for the caching must refer to the forwarded 

2173 # component and not the derived component. 

2174 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2175 

2176 # Unfortunately the FileDescriptor inside the formatter will have 

2177 # the wrong write storage class so we need to create a new one 

2178 # given the immutability constraint. 

2179 writeStorageClass = rwInfo.info.storageClass 

2180 

2181 # We may need to put some thought into parameters for read 

2182 # components but for now forward them on as is 

2183 readFormatter = type(rwInfo.formatter)( 

2184 FileDescriptor( 

2185 rwInfo.location, 

2186 readStorageClass=refStorageClass, 

2187 storageClass=writeStorageClass, 

2188 parameters=parameters, 

2189 ), 

2190 ref.dataId, 

2191 ) 

2192 

2193 # The assembler can not receive any parameter requests for a 

2194 # derived component at this time since the assembler will 

2195 # see the storage class of the derived component and those 

2196 # parameters will have to be handled by the formatter on the 

2197 # forwarded storage class. 

2198 assemblerParams: dict[str, Any] = {} 

2199 

2200 # Need to created a new info that specifies the derived 

2201 # component and associated storage class 

2202 readInfo = DatastoreFileGetInformation( 

2203 rwInfo.location, 

2204 readFormatter, 

2205 rwInfo.info, 

2206 assemblerParams, 

2207 {}, 

2208 refComponent, 

2209 refStorageClass, 

2210 ) 

2211 

2212 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2213 

2214 else: 

2215 # Single file request or component from that composite file 

2216 for lookup in (refComponent, None): 

2217 if lookup in allComponents: 

2218 getInfo = allComponents[lookup] 

2219 break 

2220 else: 

2221 raise FileNotFoundError( 

2222 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2223 ) 

2224 

2225 # Do not need the component itself if already disassembled 

2226 if isDisassembled: 

2227 isComponent = False 

2228 else: 

2229 isComponent = getInfo.component is not None 

2230 

2231 # For a component read of a composite we want the cache to 

2232 # be looking at the composite ref itself. 

2233 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2234 

2235 # For a disassembled component we can validate parametersagainst 

2236 # the component storage class directly 

2237 if isDisassembled: 

2238 refStorageClass.validateParameters(parameters) 

2239 else: 

2240 # For an assembled composite this could be a derived 

2241 # component derived from a real component. The validity 

2242 # of the parameters is not clear. For now validate against 

2243 # the composite storage class 

2244 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2245 

2246 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2247 

2248 @transactional 

2249 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2250 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2251 

2252 Parameters 

2253 ---------- 

2254 inMemoryDataset : `object` 

2255 The dataset to store. 

2256 ref : `DatasetRef` 

2257 Reference to the associated Dataset. 

2258 

2259 Raises 

2260 ------ 

2261 TypeError 

2262 Supplied object and storage class are inconsistent. 

2263 DatasetTypeNotSupportedError 

2264 The associated `DatasetType` is not handled by this datastore. 

2265 

2266 Notes 

2267 ----- 

2268 If the datastore is configured to reject certain dataset types it 

2269 is possible that the put will fail and raise a 

2270 `DatasetTypeNotSupportedError`. The main use case for this is to 

2271 allow `ChainedDatastore` to put to multiple datastores without 

2272 requiring that every datastore accepts the dataset. 

2273 """ 

2274 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2275 # doDisassembly = True 

2276 

2277 artifacts = [] 

2278 if doDisassembly: 

2279 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2280 if components is None: 

2281 raise RuntimeError( 

2282 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2283 f"with storage class {ref.datasetType.storageClass.name} " 

2284 "is configured to be disassembled, but cannot be." 

2285 ) 

2286 for component, componentInfo in components.items(): 

2287 # Don't recurse because we want to take advantage of 

2288 # bulk insert -- need a new DatasetRef that refers to the 

2289 # same dataset_id but has the component DatasetType 

2290 # DatasetType does not refer to the types of components 

2291 # So we construct one ourselves. 

2292 compRef = ref.makeComponentRef(component) 

2293 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2294 artifacts.append((compRef, storedInfo)) 

2295 else: 

2296 # Write the entire thing out 

2297 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2298 artifacts.append((ref, storedInfo)) 

2299 

2300 self._register_datasets(artifacts) 

2301 

2302 @transactional 

2303 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2304 # At this point can safely remove these datasets from the cache 

2305 # to avoid confusion later on. If they are not trashed later 

2306 # the cache will simply be refilled. 

2307 self.cacheManager.remove_from_cache(ref) 

2308 

2309 # If we are in trust mode there will be nothing to move to 

2310 # the trash table and we will have to try to delete the file 

2311 # immediately. 

2312 if self.trustGetRequest: 

2313 # Try to keep the logic below for a single file trash. 

2314 if isinstance(ref, DatasetRef): 

2315 refs = {ref} 

2316 else: 

2317 # Will recreate ref at the end of this branch. 

2318 refs = set(ref) 

2319 

2320 # Determine which datasets are known to datastore directly. 

2321 id_to_ref = {ref.id: ref for ref in refs} 

2322 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2323 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2324 

2325 missing = refs - existing_refs 

2326 if missing: 

2327 # Do an explicit existence check on these refs. 

2328 # We only care about the artifacts at this point and not 

2329 # the dataset existence. 

2330 artifact_existence: dict[ResourcePath, bool] = {} 

2331 _ = self.mexists(missing, artifact_existence) 

2332 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2333 

2334 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2335 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2336 for uri in uris: 

2337 try: 

2338 uri.remove() 

2339 except Exception as e: 

2340 if ignore_errors: 

2341 log.debug("Artifact %s could not be removed: %s", uri, e) 

2342 continue 

2343 raise 

2344 

2345 # There is no point asking the code below to remove refs we 

2346 # know are missing so update it with the list of existing 

2347 # records. Try to retain one vs many logic. 

2348 if not existing_refs: 

2349 # Nothing more to do since none of the datasets were 

2350 # known to the datastore record table. 

2351 return 

2352 ref = list(existing_refs) 

2353 if len(ref) == 1: 

2354 ref = ref[0] 

2355 

2356 # Get file metadata and internal metadata 

2357 if not isinstance(ref, DatasetRef): 

2358 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2359 # Assumed to be an iterable of refs so bulk mode enabled. 

2360 try: 

2361 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2362 except Exception as e: 

2363 if ignore_errors: 

2364 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2365 else: 

2366 raise 

2367 return 

2368 

2369 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2370 

2371 fileLocations = self._get_dataset_locations_info(ref) 

2372 

2373 if not fileLocations: 

2374 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2375 if ignore_errors: 

2376 log.warning(err_msg) 

2377 return 

2378 else: 

2379 raise FileNotFoundError(err_msg) 

2380 

2381 for location, storedFileInfo in fileLocations: 

2382 if not self._artifact_exists(location): 

2383 err_msg = ( 

2384 f"Dataset is known to datastore {self.name} but " 

2385 f"associated artifact ({location.uri}) is missing" 

2386 ) 

2387 if ignore_errors: 

2388 log.warning(err_msg) 

2389 return 

2390 else: 

2391 raise FileNotFoundError(err_msg) 

2392 

2393 # Mark dataset as trashed 

2394 try: 

2395 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2396 except Exception as e: 

2397 if ignore_errors: 

2398 log.warning( 

2399 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2400 "but encountered an error: %s", 

2401 ref, 

2402 self.name, 

2403 e, 

2404 ) 

2405 pass 

2406 else: 

2407 raise 

2408 

2409 @transactional 

2410 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2411 """Remove all datasets from the trash. 

2412 

2413 Parameters 

2414 ---------- 

2415 ignore_errors : `bool` 

2416 If `True` return without error even if something went wrong. 

2417 Problems could occur if another process is simultaneously trying 

2418 to delete. 

2419 """ 

2420 log.debug("Emptying trash in datastore %s", self.name) 

2421 

2422 # Context manager will empty trash iff we finish it without raising. 

2423 # It will also automatically delete the relevant rows from the 

2424 # trash table and the records table. 

2425 with self.bridge.emptyTrash( 

2426 self._table, record_class=StoredFileInfo, record_column="path" 

2427 ) as trash_data: 

2428 # Removing the artifacts themselves requires that the files are 

2429 # not also associated with refs that are not to be trashed. 

2430 # Therefore need to do a query with the file paths themselves 

2431 # and return all the refs associated with them. Can only delete 

2432 # a file if the refs to be trashed are the only refs associated 

2433 # with the file. 

2434 # This requires multiple copies of the trashed items 

2435 trashed, artifacts_to_keep = trash_data 

2436 

2437 if artifacts_to_keep is None: 

2438 # The bridge is not helping us so have to work it out 

2439 # ourselves. This is not going to be as efficient. 

2440 trashed = list(trashed) 

2441 

2442 # The instance check is for mypy since up to this point it 

2443 # does not know the type of info. 

2444 path_map = self._refs_associated_with_artifacts( 

2445 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2446 ) 

2447 

2448 for ref, info in trashed: 

2449 # Mypy needs to know this is not the base class 

2450 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2451 

2452 path_map[info.path].remove(ref.id) 

2453 if not path_map[info.path]: 

2454 del path_map[info.path] 

2455 

2456 artifacts_to_keep = set(path_map) 

2457 

2458 for ref, info in trashed: 

2459 # Should not happen for this implementation but need 

2460 # to keep mypy happy. 

2461 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2462 

2463 # Mypy needs to know this is not the base class 

2464 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2465 

2466 if info.path in artifacts_to_keep: 

2467 # This is a multi-dataset artifact and we are not 

2468 # removing all associated refs. 

2469 continue 

2470 

2471 # Only trashed refs still known to datastore will be returned. 

2472 location = info.file_location(self.locationFactory) 

2473 

2474 # Point of no return for this artifact 

2475 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2476 try: 

2477 self._delete_artifact(location) 

2478 except FileNotFoundError: 

2479 # If the file itself has been deleted there is nothing 

2480 # we can do about it. It is possible that trash has 

2481 # been run in parallel in another process or someone 

2482 # decided to delete the file. It is unlikely to come 

2483 # back and so we should still continue with the removal 

2484 # of the entry from the trash table. It is also possible 

2485 # we removed it in a previous iteration if it was 

2486 # a multi-dataset artifact. The delete artifact method 

2487 # will log a debug message in this scenario. 

2488 # Distinguishing file missing before trash started and 

2489 # file already removed previously as part of this trash 

2490 # is not worth the distinction with regards to potential 

2491 # memory cost. 

2492 pass 

2493 except Exception as e: 

2494 if ignore_errors: 

2495 # Use a debug message here even though it's not 

2496 # a good situation. In some cases this can be 

2497 # caused by a race between user A and user B 

2498 # and neither of them has permissions for the 

2499 # other's files. Butler does not know about users 

2500 # and trash has no idea what collections these 

2501 # files were in (without guessing from a path). 

2502 log.debug( 

2503 "Encountered error removing artifact %s from datastore %s: %s", 

2504 location.uri, 

2505 self.name, 

2506 e, 

2507 ) 

2508 else: 

2509 raise 

2510 

2511 @transactional 

2512 def transfer_from( 

2513 self, 

2514 source_datastore: Datastore, 

2515 refs: Iterable[DatasetRef], 

2516 transfer: str = "auto", 

2517 artifact_existence: dict[ResourcePath, bool] | None = None, 

2518 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2519 # Docstring inherited 

2520 if type(self) is not type(source_datastore): 

2521 raise TypeError( 

2522 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2523 f"source datastore ({type(source_datastore)})." 

2524 ) 

2525 

2526 # Be explicit for mypy 

2527 if not isinstance(source_datastore, FileDatastore): 

2528 raise TypeError( 

2529 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2530 f" {type(source_datastore)}" 

2531 ) 

2532 

2533 # Stop early if "direct" transfer mode is requested. That would 

2534 # require that the URI inside the source datastore should be stored 

2535 # directly in the target datastore, which seems unlikely to be useful 

2536 # since at any moment the source datastore could delete the file. 

2537 if transfer in ("direct", "split"): 

2538 raise ValueError( 

2539 f"Can not transfer from a source datastore using {transfer} mode since" 

2540 " those files are controlled by the other datastore." 

2541 ) 

2542 

2543 # Empty existence lookup if none given. 

2544 if artifact_existence is None: 

2545 artifact_existence = {} 

2546 

2547 # We will go through the list multiple times so must convert 

2548 # generators to lists. 

2549 refs = list(refs) 

2550 

2551 # In order to handle disassembled composites the code works 

2552 # at the records level since it can assume that internal APIs 

2553 # can be used. 

2554 # - If the record already exists in the destination this is assumed 

2555 # to be okay. 

2556 # - If there is no record but the source and destination URIs are 

2557 # identical no transfer is done but the record is added. 

2558 # - If the source record refers to an absolute URI currently assume 

2559 # that that URI should remain absolute and will be visible to the 

2560 # destination butler. May need to have a flag to indicate whether 

2561 # the dataset should be transferred. This will only happen if 

2562 # the detached Butler has had a local ingest. 

2563 

2564 # What we really want is all the records in the source datastore 

2565 # associated with these refs. Or derived ones if they don't exist 

2566 # in the source. 

2567 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2568 

2569 # The source dataset_ids are the keys in these records 

2570 source_ids = set(source_records) 

2571 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2572 

2573 requested_ids = {ref.id for ref in refs} 

2574 missing_ids = requested_ids - source_ids 

2575 

2576 # Missing IDs can be okay if that datastore has allowed 

2577 # gets based on file existence. Should we transfer what we can 

2578 # or complain about it and warn? 

2579 if missing_ids and not source_datastore.trustGetRequest: 

2580 raise ValueError( 

2581 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2582 ) 

2583 

2584 # Need to map these missing IDs to a DatasetRef so we can guess 

2585 # the details. 

2586 if missing_ids: 

2587 log.info( 

2588 "Number of expected datasets missing from source datastore records: %d out of %d", 

2589 len(missing_ids), 

2590 len(requested_ids), 

2591 ) 

2592 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2593 

2594 # This should be chunked in case we end up having to check 

2595 # the file store since we need some log output to show 

2596 # progress. 

2597 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2598 records = {} 

2599 for missing in missing_ids_chunk: 

2600 # Ask the source datastore where the missing artifacts 

2601 # should be. An execution butler might not know about the 

2602 # artifacts even if they are there. 

2603 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2604 records[missing] = [info for _, info in expected] 

2605 

2606 # Call the mexist helper method in case we have not already 

2607 # checked these artifacts such that artifact_existence is 

2608 # empty. This allows us to benefit from parallelism. 

2609 # datastore.mexists() itself does not give us access to the 

2610 # derived datastore record. 

2611 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2612 ref_exists = source_datastore._process_mexists_records( 

2613 id_to_ref, records, False, artifact_existence=artifact_existence 

2614 ) 

2615 

2616 # Now go through the records and propagate the ones that exist. 

2617 location_factory = source_datastore.locationFactory 

2618 for missing, record_list in records.items(): 

2619 # Skip completely if the ref does not exist. 

2620 ref = id_to_ref[missing] 

2621 if not ref_exists[ref]: 

2622 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2623 continue 

2624 # Check for file artifact to decide which parts of a 

2625 # disassembled composite do exist. If there is only a 

2626 # single record we don't even need to look because it can't 

2627 # be a composite and must exist. 

2628 if len(record_list) == 1: 

2629 dataset_records = record_list 

2630 else: 

2631 dataset_records = [ 

2632 record 

2633 for record in record_list 

2634 if artifact_existence[record.file_location(location_factory).uri] 

2635 ] 

2636 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2637 

2638 # Rely on source_records being a defaultdict. 

2639 source_records[missing].extend(dataset_records) 

2640 

2641 # See if we already have these records 

2642 target_records = self._get_stored_records_associated_with_refs(refs) 

2643 

2644 # The artifacts to register 

2645 artifacts = [] 

2646 

2647 # Refs that already exist 

2648 already_present = [] 

2649 

2650 # Refs that were rejected by this datastore. 

2651 rejected = set() 

2652 

2653 # Refs that were transferred successfully. 

2654 accepted = set() 

2655 

2656 # Record each time we have done a "direct" transfer. 

2657 direct_transfers = [] 

2658 

2659 # Now can transfer the artifacts 

2660 for ref in refs: 

2661 if not self.constraints.isAcceptable(ref): 

2662 # This datastore should not be accepting this dataset. 

2663 rejected.add(ref) 

2664 continue 

2665 

2666 accepted.add(ref) 

2667 

2668 if ref.id in target_records: 

2669 # Already have an artifact for this. 

2670 already_present.append(ref) 

2671 continue 

2672 

2673 # mypy needs to know these are always resolved refs 

2674 for info in source_records[ref.id]: 

2675 source_location = info.file_location(source_datastore.locationFactory) 

2676 target_location = info.file_location(self.locationFactory) 

2677 if source_location == target_location and not source_location.pathInStore.isabs(): 

2678 # Artifact is already in the target location. 

2679 # (which is how execution butler currently runs) 

2680 pass 

2681 else: 

2682 if target_location.pathInStore.isabs(): 

2683 # Just because we can see the artifact when running 

2684 # the transfer doesn't mean it will be generally 

2685 # accessible to a user of this butler. Need to decide 

2686 # what to do about an absolute path. 

2687 if transfer == "auto": 

2688 # For "auto" transfers we allow the absolute URI 

2689 # to be recorded in the target datastore. 

2690 direct_transfers.append(source_location) 

2691 else: 

2692 # The user is explicitly requesting a transfer 

2693 # even for an absolute URI. This requires us to 

2694 # calculate the target path. 

2695 template_ref = ref 

2696 if info.component: 

2697 template_ref = ref.makeComponentRef(info.component) 

2698 target_location = self._calculate_ingested_datastore_name( 

2699 source_location.uri, 

2700 template_ref, 

2701 ) 

2702 

2703 info = info.update(path=target_location.pathInStore.path) 

2704 

2705 # Need to transfer it to the new location. 

2706 # Assume we should always overwrite. If the artifact 

2707 # is there this might indicate that a previous transfer 

2708 # was interrupted but was not able to be rolled back 

2709 # completely (eg pre-emption) so follow Datastore default 

2710 # and overwrite. 

2711 target_location.uri.transfer_from( 

2712 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2713 ) 

2714 

2715 artifacts.append((ref, info)) 

2716 

2717 if direct_transfers: 

2718 log.info( 

2719 "Transfer request for an outside-datastore artifact with absolute URI done %d time%s", 

2720 len(direct_transfers), 

2721 "" if len(direct_transfers) == 1 else "s", 

2722 ) 

2723 

2724 self._register_datasets(artifacts) 

2725 

2726 if already_present: 

2727 n_skipped = len(already_present) 

2728 log.info( 

2729 "Skipped transfer of %d dataset%s already present in datastore", 

2730 n_skipped, 

2731 "" if n_skipped == 1 else "s", 

2732 ) 

2733 

2734 return accepted, rejected 

2735 

2736 @transactional 

2737 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2738 # Docstring inherited. 

2739 refs = list(refs) 

2740 self.bridge.forget(refs) 

2741 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2742 

2743 def validateConfiguration( 

2744 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2745 ) -> None: 

2746 """Validate some of the configuration for this datastore. 

2747 

2748 Parameters 

2749 ---------- 

2750 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2751 Entities to test against this configuration. Can be differing 

2752 types. 

2753 logFailures : `bool`, optional 

2754 If `True`, output a log message for every validation error 

2755 detected. 

2756 

2757 Raises 

2758 ------ 

2759 DatastoreValidationError 

2760 Raised if there is a validation problem with a configuration. 

2761 All the problems are reported in a single exception. 

2762 

2763 Notes 

2764 ----- 

2765 This method checks that all the supplied entities have valid file 

2766 templates and also have formatters defined. 

2767 """ 

2768 templateFailed = None 

2769 try: 

2770 self.templates.validateTemplates(entities, logFailures=logFailures) 

2771 except FileTemplateValidationError as e: 

2772 templateFailed = str(e) 

2773 

2774 formatterFailed = [] 

2775 for entity in entities: 

2776 try: 

2777 self.formatterFactory.getFormatterClass(entity) 

2778 except KeyError as e: 

2779 formatterFailed.append(str(e)) 

2780 if logFailures: 

2781 log.critical("Formatter failure: %s", e) 

2782 

2783 if templateFailed or formatterFailed: 

2784 messages = [] 

2785 if templateFailed: 

2786 messages.append(templateFailed) 

2787 if formatterFailed: 

2788 messages.append(",".join(formatterFailed)) 

2789 msg = ";\n".join(messages) 

2790 raise DatastoreValidationError(msg) 

2791 

2792 def getLookupKeys(self) -> set[LookupKey]: 

2793 # Docstring is inherited from base class 

2794 return ( 

2795 self.templates.getLookupKeys() 

2796 | self.formatterFactory.getLookupKeys() 

2797 | self.constraints.getLookupKeys() 

2798 ) 

2799 

2800 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2801 # Docstring is inherited from base class 

2802 # The key can be valid in either formatters or templates so we can 

2803 # only check the template if it exists 

2804 if lookupKey in self.templates: 

2805 try: 

2806 self.templates[lookupKey].validateTemplate(entity) 

2807 except FileTemplateValidationError as e: 

2808 raise DatastoreValidationError(e) from e 

2809 

2810 def export( 

2811 self, 

2812 refs: Iterable[DatasetRef], 

2813 *, 

2814 directory: ResourcePathExpression | None = None, 

2815 transfer: str | None = "auto", 

2816 ) -> Iterable[FileDataset]: 

2817 # Docstring inherited from Datastore.export. 

2818 if transfer == "auto" and directory is None: 

2819 transfer = None 

2820 

2821 if transfer is not None and directory is None: 

2822 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2823 

2824 if transfer == "move": 

2825 raise TypeError("Can not export by moving files out of datastore.") 

2826 elif transfer == "direct": 

2827 # For an export, treat this as equivalent to None. We do not 

2828 # want an import to risk using absolute URIs to datasets owned 

2829 # by another datastore. 

2830 log.info("Treating 'direct' transfer mode as in-place export.") 

2831 transfer = None 

2832 

2833 # Force the directory to be a URI object 

2834 directoryUri: ResourcePath | None = None 

2835 if directory is not None: 

2836 directoryUri = ResourcePath(directory, forceDirectory=True) 

2837 

2838 if transfer is not None and directoryUri is not None: 

2839 # mypy needs the second test 

2840 if not directoryUri.exists(): 

2841 raise FileNotFoundError(f"Export location {directory} does not exist") 

2842 

2843 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2844 for ref in progress.wrap(refs, "Exporting dataset files"): 

2845 fileLocations = self._get_dataset_locations_info(ref) 

2846 if not fileLocations: 

2847 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2848 # For now we can not export disassembled datasets 

2849 if len(fileLocations) > 1: 

2850 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2851 location, storedFileInfo = fileLocations[0] 

2852 

2853 pathInStore = location.pathInStore.path 

2854 if transfer is None: 

2855 # TODO: do we also need to return the readStorageClass somehow? 

2856 # We will use the path in store directly. If this is an 

2857 # absolute URI, preserve it. 

2858 if location.pathInStore.isabs(): 

2859 pathInStore = str(location.uri) 

2860 elif transfer == "direct": 

2861 # Use full URIs to the remote store in the export 

2862 pathInStore = str(location.uri) 

2863 else: 

2864 # mypy needs help 

2865 assert directoryUri is not None, "directoryUri must be defined to get here" 

2866 storeUri = ResourcePath(location.uri) 

2867 

2868 # if the datastore has an absolute URI to a resource, we 

2869 # have two options: 

2870 # 1. Keep the absolute URI in the exported YAML 

2871 # 2. Allocate a new name in the local datastore and transfer 

2872 # it. 

2873 # For now go with option 2 

2874 if location.pathInStore.isabs(): 

2875 template = self.templates.getTemplate(ref) 

2876 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2877 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2878 

2879 exportUri = directoryUri.join(pathInStore) 

2880 exportUri.transfer_from(storeUri, transfer=transfer) 

2881 

2882 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2883 

2884 @staticmethod 

2885 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2886 """Compute the checksum of the supplied file. 

2887 

2888 Parameters 

2889 ---------- 

2890 uri : `lsst.resources.ResourcePath` 

2891 Name of resource to calculate checksum from. 

2892 algorithm : `str`, optional 

2893 Name of algorithm to use. Must be one of the algorithms supported 

2894 by :py:class`hashlib`. 

2895 block_size : `int` 

2896 Number of bytes to read from file at one time. 

2897 

2898 Returns 

2899 ------- 

2900 hexdigest : `str` 

2901 Hex digest of the file. 

2902 

2903 Notes 

2904 ----- 

2905 Currently returns None if the URI is for a remote resource. 

2906 """ 

2907 if algorithm not in hashlib.algorithms_guaranteed: 

2908 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2909 

2910 if not uri.isLocal: 

2911 return None 

2912 

2913 hasher = hashlib.new(algorithm) 

2914 

2915 with uri.as_local() as local_uri: 

2916 with open(local_uri.ospath, "rb") as f: 

2917 for chunk in iter(lambda: f.read(block_size), b""): 

2918 hasher.update(chunk) 

2919 

2920 return hasher.hexdigest() 

2921 

2922 def needs_expanded_data_ids( 

2923 self, 

2924 transfer: str | None, 

2925 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2926 ) -> bool: 

2927 # Docstring inherited. 

2928 # This _could_ also use entity to inspect whether the filename template 

2929 # involves placeholders other than the required dimensions for its 

2930 # dataset type, but that's not necessary for correctness; it just 

2931 # enables more optimizations (perhaps only in theory). 

2932 return transfer not in ("direct", None) 

2933 

2934 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2935 # Docstring inherited from the base class. 

2936 record_data = data.get(self.name) 

2937 if not record_data: 

2938 return 

2939 

2940 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2941 

2942 # TODO: Verify that there are no unexpected table names in the dict? 

2943 unpacked_records = [] 

2944 for dataset_data in record_data.records.values(): 

2945 records = dataset_data.get(self._table.name) 

2946 if records: 

2947 for info in records: 

2948 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2949 unpacked_records.append(info.to_record()) 

2950 if unpacked_records: 

2951 self._table.insert(*unpacked_records, transaction=self._transaction) 

2952 

2953 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2954 # Docstring inherited from the base class. 

2955 exported_refs = list(self._bridge.check(refs)) 

2956 ids = {ref.id for ref in exported_refs} 

2957 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2958 for row in self._table.fetch(dataset_id=ids): 

2959 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2960 dataset_records = records.setdefault(info.dataset_id, {}) 

2961 dataset_records.setdefault(self._table.name, []).append(info) 

2962 

2963 record_data = DatastoreRecordData(records=records) 

2964 return {self.name: record_data} 

2965 

2966 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2967 # Docstring inherited from the base class. 

2968 self._retrieve_dataset_method = method 

2969 

2970 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2971 """Update dataset reference to use the storage class from registry.""" 

2972 if self._retrieve_dataset_method is None: 

2973 # We could raise an exception here but unit tests do not define 

2974 # this method. 

2975 return ref 

2976 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2977 if dataset_type is not None: 

2978 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2979 return ref