Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 9%

977 statements  

coverage.py v7.2.7, created at 2023-07-12 10:56 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Generic file-based datastore code.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("FileDatastore",) 

27 

28import hashlib 

29import logging 

30from collections import defaultdict 

31from collections.abc import Callable, Iterable, Mapping, Sequence 

32from dataclasses import dataclass 

33from typing import TYPE_CHECKING, Any, ClassVar 

34 

35from lsst.daf.butler import ( 

36 CompositesMap, 

37 Config, 

38 DatasetId, 

39 DatasetRef, 

40 DatasetRefURIs, 

41 DatasetType, 

42 DatasetTypeNotSupportedError, 

43 Datastore, 

44 DatastoreCacheManager, 

45 DatastoreConfig, 

46 DatastoreDisabledCacheManager, 

47 DatastoreRecordData, 

48 DatastoreValidationError, 

49 FileDataset, 

50 FileDescriptor, 

51 FileTemplates, 

52 FileTemplateValidationError, 

53 Formatter, 

54 FormatterFactory, 

55 Location, 

56 LocationFactory, 

57 Progress, 

58 StorageClass, 

59 StoredDatastoreItemInfo, 

60 StoredFileInfo, 

61 ddl, 

62) 

63from lsst.daf.butler.core.repoRelocation import replaceRoot 

64from lsst.daf.butler.core.utils import transactional 

65from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

66from lsst.resources import ResourcePath, ResourcePathExpression 

67from lsst.utils.introspection import get_class_of, get_instance_of 

68from lsst.utils.iteration import chunk_iterable 

69 

70# For VERBOSE logging usage. 

71from lsst.utils.logging import VERBOSE, getLogger 

72from lsst.utils.timer import time_this 

73from sqlalchemy import BigInteger, String 

74 

75from ..registry.interfaces import FakeDatasetRef 

76from .genericDatastore import GenericBaseDatastore 

77 

78if TYPE_CHECKING: 

79 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

80 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

81 

82log = getLogger(__name__) 

83 

84 

85class _IngestPrepData(Datastore.IngestPrepData): 

86 """Helper class for FileDatastore ingest implementation. 

87 

88 Parameters 

89 ---------- 

90 datasets : `~collections.abc.Iterable` of `FileDataset` 

91 Files to be ingested by this datastore. 

92 """ 

93 

94 def __init__(self, datasets: Iterable[FileDataset]): 

95 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

96 self.datasets = datasets 

97 

98 

99@dataclass(frozen=True) 

100class DatastoreFileGetInformation: 

101 """Collection of useful parameters needed to retrieve a file from 

102 a Datastore. 

103 """ 

104 

105 location: Location 

106 """The location from which to read the dataset.""" 

107 

108 formatter: Formatter 

109 """The `Formatter` to use to deserialize the dataset.""" 

110 

111 info: StoredFileInfo 

112 """Stored information about this file and its formatter.""" 

113 

114 assemblerParams: Mapping[str, Any] 

115 """Parameters to use for post-processing the retrieved dataset.""" 

116 

117 formatterParams: Mapping[str, Any] 

118 """Parameters that were understood by the associated formatter.""" 

119 

120 component: str | None 

121 """The component to be retrieved (can be `None`).""" 

122 

123 readStorageClass: StorageClass 

124 """The `StorageClass` of the dataset being read.""" 

125 

126 

127class FileDatastore(GenericBaseDatastore): 

128 """Generic Datastore for file-based implementations. 

129 

130 Should always be sub-classed since key abstract methods are missing. 

131 

132 Parameters 

133 ---------- 

134 config : `DatastoreConfig` or `str` 

135 Configuration as either a `Config` object or URI to file. 

136 bridgeManager : `DatastoreRegistryBridgeManager` 

137 Object that manages the interface between `Registry` and datastores. 

138 butlerRoot : `str`, optional 

139 New datastore root to use to override the configuration value. 

140 

141 Raises 

142 ------ 

143 ValueError 

144 If root location does not exist and ``create`` is `False` in the 

145 configuration. 

146 """ 

147 

148 defaultConfigFile: ClassVar[str | None] = None 

149 """Path to configuration defaults. Accessed within the ``config`` resource 

150 or relative to a search path. Can be None if no defaults specified. 

151 """ 

152 

153 root: ResourcePath 

154 """Root directory URI of this `Datastore`.""" 

155 

156 locationFactory: LocationFactory 

157 """Factory for creating locations relative to the datastore root.""" 

158 

159 formatterFactory: FormatterFactory 

160 """Factory for creating instances of formatters.""" 

161 

162 templates: FileTemplates 

163 """File templates that can be used by this `Datastore`.""" 

164 

165 composites: CompositesMap 

166 """Determines whether a dataset should be disassembled on put.""" 

167 

168 defaultConfigFile = "datastores/fileDatastore.yaml" 

169 """Path to configuration defaults. Accessed within the ``config`` resource 

170 or relative to a search path. Can be None if no defaults specified. 

171 """ 

172 

173 _retrieve_dataset_method: Callable[[str], DatasetType | None] | None = None 

174 """Callable that is used in trusted mode to retrieve registry definition 

175 of a named dataset type. 

176 """ 

177 

178 @classmethod 

179 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

180 """Set any filesystem-dependent config options for this Datastore to 

181 be appropriate for a new empty repository with the given root. 

182 

183 Parameters 

184 ---------- 

185 root : `str` 

186 URI to the root of the data repository. 

187 config : `Config` 

188 A `Config` to update. Only the subset understood by 

189 this component will be updated. Will not expand 

190 defaults. 

191 full : `Config` 

192 A complete config with all defaults expanded that can be 

193 converted to a `DatastoreConfig`. Read-only and will not be 

194 modified by this method. 

195 Repository-specific options that should not be obtained 

196 from defaults when Butler instances are constructed 

197 should be copied from ``full`` to ``config``. 

198 overwrite : `bool`, optional 

199 If `False`, do not modify a value in ``config`` if the value 

200 already exists. Default is always to overwrite with the provided 

201 ``root``. 

202 

203 Notes 

204 ----- 

205 If a keyword is explicitly defined in the supplied ``config`` it 

206 will not be overridden by this method if ``overwrite`` is `False`. 

207 This allows explicit values set in external configs to be retained. 

208 """ 

209 Config.updateParameters( 

210 DatastoreConfig, 

211 config, 

212 full, 

213 toUpdate={"root": root}, 

214 toCopy=("cls", ("records", "table")), 

215 overwrite=overwrite, 

216 ) 

217 

218 @classmethod 

219 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

220 return ddl.TableSpec( 

221 fields=[ 

222 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

223 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

224 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

225 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

226 # Use empty string to indicate no component 

227 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

228 # TODO: should checksum be Base64Bytes instead? 

229 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

230 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

231 ], 

232 unique=frozenset(), 

233 indexes=[ddl.IndexSpec("path")], 

234 ) 

235 
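For illustration, a minimal sketch of calling this spec factory; using ``ddl.GUID`` for the dataset-ID column is an assumption made here for the example, not something mandated by this file:

```python
from lsst.daf.butler import ddl
from lsst.daf.butler.datastores.fileDatastore import FileDatastore

# Sketch only: build the records-table spec assuming a UUID-style
# dataset_id column (ddl.GUID is an assumption for illustration).
spec = FileDatastore.makeTableSpec(datasetIdColumnType=ddl.GUID)

# The table is keyed on (dataset_id, component), so a disassembled
# composite stores one row per component for the same dataset_id.
print([field.name for field in spec.fields])
```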

236 def __init__( 

237 self, 

238 config: DatastoreConfig | ResourcePathExpression, 

239 bridgeManager: DatastoreRegistryBridgeManager, 

240 butlerRoot: str | None = None, 

241 ): 

242 super().__init__(config, bridgeManager) 

243 if "root" not in self.config: 

244 raise ValueError("No root directory specified in configuration") 

245 

246 self._bridgeManager = bridgeManager 

247 

248 # Name ourselves either using an explicit name or a name 

249 # derived from the (unexpanded) root 

250 if "name" in self.config: 

251 self.name = self.config["name"] 

252 else: 

253 # We use the unexpanded root in the name to indicate that this 

254 # datastore can be moved without having to update registry. 

255 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

256 

257 # Support repository relocation in config 

258 # Existence of self.root is checked in subclass 

259 self.root = ResourcePath( 

260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

261 ) 

262 

263 self.locationFactory = LocationFactory(self.root) 

264 self.formatterFactory = FormatterFactory() 

265 

266 # Now associate formatters with storage classes 

267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

268 

269 # Read the file naming templates 

270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

271 

272 # See if composites should be disassembled 

273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

280 ) 

281 # Interface to Registry. 

282 self._bridge = bridgeManager.register(self.name) 

283 except ReadOnlyDatabaseError: 

284 # If the database is read only and we just tried and failed to 

285 # create a table, it means someone is trying to create a read-only 

286 # butler client for an empty repo. That should be okay, as long

287 # as they then don't try to get any datasets before some other client

288 # creates the table. Chances are they're just validating

289 # configuration.

290 pass 

291 

292 # Determine whether checksums should be used - default to False 

293 self.useChecksum = self.config.get("checksum", False) 

294 

295 # Determine whether we can fall back to configuration if a 

296 # requested dataset is not known to registry 

297 self.trustGetRequest = self.config.get("trust_get_request", False) 

298 

299 # Create a cache manager 

300 self.cacheManager: AbstractDatastoreCacheManager 

301 if "cached" in self.config: 

302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

305 

306 # Check existence and create directory structure if necessary 

307 if not self.root.exists(): 

308 if "create" not in self.config or not self.config["create"]: 

309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

310 try: 

311 self.root.mkdir() 

312 except Exception as e: 

313 raise ValueError( 

314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

315 ) from e 

316 
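As a rough sketch of what the constructor above consumes, the configuration keys it reads could be laid out as below; the values are illustrative, not the packaged defaults from ``datastores/fileDatastore.yaml``:

```python
# Illustrative shape only -- these are the keys read by __init__ above.
datastore_config = {
    "cls": "lsst.daf.butler.datastores.fileDatastore.FileDatastore",
    "root": "<butlerRoot>/datastore",  # expanded through replaceRoot()
    "create": True,                    # allow the root directory to be created
    "records": {"table": "file_datastore_records"},  # hypothetical table name
    "checksum": False,                 # becomes self.useChecksum
    "trust_get_request": False,        # becomes self.trustGetRequest
    "templates": {},                   # file naming templates (FileTemplates)
    "formatters": {},                  # storage class -> formatter mappings
    "composites": {},                  # disassembly rules (CompositesMap)
    # "cached": {...}                  # optional cache manager configuration
}
```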

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 @property 

325 def roots(self) -> dict[str, ResourcePath | None]: 

326 # Docstring inherited. 

327 return {self.name: self.root} 

328 

329 def _artifact_exists(self, location: Location) -> bool: 

330 """Check that an artifact exists in this datastore at the specified 

331 location. 

332 

333 Parameters 

334 ---------- 

335 location : `Location` 

336 Expected location of the artifact associated with this datastore. 

337 

338 Returns 

339 ------- 

340 exists : `bool` 

341 `True` if the location can be found, `False` otherwise. 

342 """ 

343 log.debug("Checking if resource exists: %s", location.uri) 

344 return location.uri.exists() 

345 

346 def _delete_artifact(self, location: Location) -> None: 

347 """Delete the artifact from the datastore. 

348 

349 Parameters 

350 ---------- 

351 location : `Location` 

352 Location of the artifact associated with this datastore. 

353 """ 

354 if location.pathInStore.isabs(): 

355 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

356 

357 try: 

358 location.uri.remove() 

359 except FileNotFoundError: 

360 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

361 raise 

362 except Exception as e: 

363 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

364 raise 

365 log.debug("Successfully deleted file: %s", location.uri) 

366 

367 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

368 # Docstring inherited from GenericBaseDatastore 

369 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

370 self._table.insert(*records, transaction=self._transaction) 

371 

372 def getStoredItemsInfo(self, ref: DatasetIdRef) -> list[StoredFileInfo]: 

373 # Docstring inherited from GenericBaseDatastore 

374 

375 # Look for the dataset_id -- there might be multiple matches 

376 # if we have disassembled the dataset. 

377 records = self._table.fetch(dataset_id=ref.id) 

378 return [StoredFileInfo.from_record(record) for record in records] 

379 

380 def _get_stored_records_associated_with_refs( 

381 self, refs: Iterable[DatasetIdRef] 

382 ) -> dict[DatasetId, list[StoredFileInfo]]: 

383 """Retrieve all records associated with the provided refs. 

384 

385 Parameters 

386 ---------- 

387 refs : iterable of `DatasetIdRef` 

388 The refs for which records are to be retrieved. 

389 

390 Returns 

391 ------- 

392 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

393 The matching records indexed by the ref ID. The number of entries 

394 in the dict can be smaller than the number of requested refs. 

395 """ 

396 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

397 

398 # Uniqueness is dataset_id + component so can have multiple records 

399 # per ref. 

400 records_by_ref = defaultdict(list) 

401 for record in records: 

402 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

403 return records_by_ref 

404 
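A short consumption sketch (``datastore`` and ``refs`` are assumed to exist already): because the returned mapping can omit refs that have no stored records, callers typically guard with ``dict.get``.

```python
records_by_id = datastore._get_stored_records_associated_with_refs(refs)
for ref in refs:
    infos = records_by_id.get(ref.id, [])
    if not infos:
        print(f"{ref} has no datastore records")
    for info in infos:
        # One StoredFileInfo per stored artifact; a disassembled composite
        # contributes one entry per component.
        print(ref.id, info.component, info.path)
```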

405 def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: 

406 """Return paths and associated dataset refs. 

407 

408 Parameters 

409 ---------- 

410 paths : `list` of `str` or `lsst.resources.ResourcePath` 

411 All the paths to include in search. 

412 

413 Returns 

414 ------- 

415 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

416 Mapping of each path to a set of associated database IDs. 

417 """ 

418 records = self._table.fetch(path=[str(path) for path in paths]) 

419 result = defaultdict(set) 

420 for row in records: 

421 result[row["path"]].add(row["dataset_id"]) 

422 return result 

423 

424 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: 

425 """Return all dataset refs associated with the supplied path. 

426 

427 Parameters 

428 ---------- 

429 pathInStore : `lsst.resources.ResourcePath` 

430 Path of interest in the data store. 

431 

432 Returns 

433 ------- 

434 ids : `set` of `int` 

435 All `DatasetRef` IDs associated with this path. 

436 """ 

437 records = list(self._table.fetch(path=str(pathInStore))) 

438 ids = {r["dataset_id"] for r in records} 

439 return ids 

440 

441 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

442 # Docstring inherited from GenericBaseDatastore 

443 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

444 

445 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> list[tuple[Location, StoredFileInfo]]: 

446 r"""Find all the `Location`\ s of the requested dataset in the 

447 `Datastore` and the associated stored file information. 

448 

449 Parameters 

450 ---------- 

451 ref : `DatasetRef` 

452 Reference to the required `Dataset`. 

453 

454 Returns 

455 ------- 

456 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

457 Location of the dataset within the datastore and 

458 stored information about each file and its formatter. 

459 """ 

460 # Get the file information (this will fail if no file) 

461 records = self.getStoredItemsInfo(ref) 

462 

463 # Use the path to determine the location -- we need to take 

464 # into account absolute URIs in the datastore record 

465 return [(r.file_location(self.locationFactory), r) for r in records] 

466 

467 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

468 """Check that there is only one dataset associated with the 

469 specified artifact. 

470 

471 Parameters 

472 ---------- 

473 ref : `DatasetRef` or `FakeDatasetRef` 

474 Dataset to be removed. 

475 location : `Location` 

476 The location of the artifact to be removed. 

477 

478 Returns 

479 ------- 

480 can_remove : `bool` 

481 True if the artifact can be safely removed. 

482 """ 

483 # Can't ever delete absolute URIs. 

484 if location.pathInStore.isabs(): 

485 return False 

486 

487 # Get all entries associated with this path 

488 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

489 if not allRefs: 

490 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

491 

492 # Remove these refs from all the refs and if there is nothing left 

493 # then we can delete 

494 remainingRefs = allRefs - {ref.id} 

495 

496 if remainingRefs: 

497 return False 

498 return True 

499 

500 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Location, StoredFileInfo]]: 

501 """Predict the location and related file information of the requested 

502 dataset in this datastore. 

503 

504 Parameters 

505 ---------- 

506 ref : `DatasetRef` 

507 Reference to the required `Dataset`. 

508 

509 Returns 

510 ------- 

511 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

512 Expected Location of the dataset within the datastore and 

513 placeholder information about each file and its formatter. 

514 

515 Notes 

516 ----- 

517 Uses the current configuration to determine how we would expect the 

518 datastore files to have been written if we couldn't ask registry. 

519 This is safe so long as there has been no change to datastore 

520 configuration between writing the dataset and wanting to read it. 

521 Will not work for files that have been ingested without using the 

522 standard file template or default formatter. 

523 """ 

524 # If we have a component ref we always need to ask the questions 

525 # of the composite. If the composite is disassembled this routine 

526 # should return all components. If the composite was not 

527 # disassembled the composite is what is stored regardless of 

528 # component request. Note that if the caller has disassembled 

529 # a composite there is no way for this guess to know that 

530 # without trying both the composite and component ref and seeing 

531 # if there is something at the component Location even without 

532 # disassembly being enabled. 

533 if ref.datasetType.isComponent(): 

534 ref = ref.makeCompositeRef() 

535 

536 # See if the ref is a composite that should be disassembled 

537 doDisassembly = self.composites.shouldBeDisassembled(ref) 

538 

539 all_info: list[tuple[Location, Formatter, StorageClass, str | None]] = [] 

540 

541 if doDisassembly: 

542 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

543 compRef = ref.makeComponentRef(component) 

544 location, formatter = self._determine_put_formatter_location(compRef) 

545 all_info.append((location, formatter, componentStorage, component)) 

546 

547 else: 

548 # Always use the composite ref if no disassembly 

549 location, formatter = self._determine_put_formatter_location(ref) 

550 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

551 

552 # Convert the list of tuples to have StoredFileInfo as second element 

553 return [ 

554 ( 

555 location, 

556 StoredFileInfo( 

557 formatter=formatter, 

558 path=location.pathInStore.path, 

559 storageClass=storageClass, 

560 component=component, 

561 checksum=None, 

562 file_size=-1, 

563 dataset_id=ref.id, 

564 ), 

565 ) 

566 for location, formatter, storageClass, component in all_info 

567 ] 

568 

569 def _prepare_for_get( 

570 self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None 

571 ) -> list[DatastoreFileGetInformation]: 

572 """Check parameters for ``get`` and obtain formatter and 

573 location. 

574 

575 Parameters 

576 ---------- 

577 ref : `DatasetRef` 

578 Reference to the required Dataset. 

579 parameters : `dict` 

580 `StorageClass`-specific parameters that specify, for example, 

581 a slice of the dataset to be loaded. 

582 

583 Returns 

584 ------- 

585 getInfo : `list` [`DatastoreFileGetInformation`] 

586 Parameters needed to retrieve each file. 

587 """ 

588 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

589 

590 # The storage class we want to use eventually 

591 refStorageClass = ref.datasetType.storageClass 

592 

593 # For trusted mode need to reset storage class. 

594 ref = self._cast_storage_class(ref) 

595 

596 # Get file metadata and internal metadata 

597 fileLocations = self._get_dataset_locations_info(ref) 

598 if not fileLocations: 

599 if not self.trustGetRequest: 

600 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

601 # Assume the dataset is where we think it should be 

602 fileLocations = self._get_expected_dataset_locations_info(ref) 

603 

604 if len(fileLocations) > 1: 

605 disassembled = True 

606 

607 # If trust is involved it is possible that there will be 

608 # components listed here that do not exist in the datastore. 

609 # Explicitly check for file artifact existence and filter out any 

610 # that are missing. 

611 if self.trustGetRequest: 

612 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

613 

614 # For now complain only if we have no components at all. One 

615 # component is probably a problem but we can punt that to the 

616 # assembler. 

617 if not fileLocations: 

618 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

619 

620 else: 

621 disassembled = False 

622 

623 # Is this a component request? 

624 refComponent = ref.datasetType.component() 

625 

626 fileGetInfo = [] 

627 for location, storedFileInfo in fileLocations: 

628 # The storage class used to write the file 

629 writeStorageClass = storedFileInfo.storageClass 

630 

631 # If this has been disassembled we need read to match the write 

632 if disassembled: 

633 readStorageClass = writeStorageClass 

634 else: 

635 readStorageClass = refStorageClass 

636 

637 formatter = get_instance_of( 

638 storedFileInfo.formatter, 

639 FileDescriptor( 

640 location, 

641 readStorageClass=readStorageClass, 

642 storageClass=writeStorageClass, 

643 parameters=parameters, 

644 ), 

645 ref.dataId, 

646 ) 

647 

648 formatterParams, notFormatterParams = formatter.segregateParameters() 

649 

650 # Of the remaining parameters, extract the ones supported by 

651 # this StorageClass (for components not all will be handled) 

652 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

653 

654 # The ref itself could be a component if the dataset was 

655 # disassembled by butler, or we disassembled in datastore and 

656 # components came from the datastore records 

657 component = storedFileInfo.component if storedFileInfo.component else refComponent 

658 

659 fileGetInfo.append( 

660 DatastoreFileGetInformation( 

661 location, 

662 formatter, 

663 storedFileInfo, 

664 assemblerParams, 

665 formatterParams, 

666 component, 

667 readStorageClass, 

668 ) 

669 ) 

670 

671 return fileGetInfo 

672 

673 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: 

674 """Check the arguments for ``put`` and obtain formatter and 

675 location. 

676 

677 Parameters 

678 ---------- 

679 inMemoryDataset : `object` 

680 The dataset to store. 

681 ref : `DatasetRef` 

682 Reference to the associated Dataset. 

683 

684 Returns 

685 ------- 

686 location : `Location` 

687 The location to write the dataset. 

688 formatter : `Formatter` 

689 The `Formatter` to use to write the dataset. 

690 

691 Raises 

692 ------ 

693 TypeError 

694 Supplied object and storage class are inconsistent. 

695 DatasetTypeNotSupportedError 

696 The associated `DatasetType` is not handled by this datastore. 

697 """ 

698 self._validate_put_parameters(inMemoryDataset, ref) 

699 return self._determine_put_formatter_location(ref) 

700 

701 def _determine_put_formatter_location(self, ref: DatasetRef) -> tuple[Location, Formatter]: 

702 """Calculate the formatter and output location to use for put. 

703 

704 Parameters 

705 ---------- 

706 ref : `DatasetRef` 

707 Reference to the associated Dataset. 

708 

709 Returns 

710 ------- 

711 location : `Location` 

712 The location to write the dataset. 

713 formatter : `Formatter` 

714 The `Formatter` to use to write the dataset. 

715 """ 

716 # Work out output file name 

717 try: 

718 template = self.templates.getTemplate(ref) 

719 except KeyError as e: 

720 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

721 

722 # Validate the template to protect against filenames from different 

723 # dataIds resolving to the same filename and causing overwrite confusion. 

724 template.validateTemplate(ref) 

725 

726 location = self.locationFactory.fromPath(template.format(ref)) 

727 

728 # Get the formatter based on the storage class 

729 storageClass = ref.datasetType.storageClass 

730 try: 

731 formatter = self.formatterFactory.getFormatter( 

732 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

733 ) 

734 except KeyError as e: 

735 raise DatasetTypeNotSupportedError( 

736 f"Unable to find formatter for {ref} in datastore {self.name}" 

737 ) from e 

738 

739 # Now that we know the formatter, update the location 

740 location = formatter.makeUpdatedLocation(location) 

741 

742 return location, formatter 

743 
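To make the template step concrete, here is a hypothetical ``templates`` entry (not the shipped default) and the kind of path it would produce; the real grammar is defined by `FileTemplates`:

```python
# Hypothetical template entry -- illustrative only.
templates_config = {
    "default": "{run}/{datasetType}/{datasetType}_{visit}_{detector}",
}
# For a ref in run "my_run" with dataId {"visit": 903334, "detector": 22},
# template.format(ref) would yield something like
#     "my_run/calexp/calexp_903334_22"
# and formatter.makeUpdatedLocation() then appends the formatter's
# preferred file extension (e.g. ".fits").
```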

744 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

745 # Docstring inherited from base class 

746 if transfer != "auto": 

747 return transfer 

748 

749 # See if the paths are within the datastore or not 

750 inside = [self._pathInStore(d.path) is not None for d in datasets] 

751 

752 if all(inside): 

753 transfer = None 

754 elif not any(inside): 

755 # Allow ResourcePath to use its own knowledge 

756 transfer = "auto" 

757 else: 

758 # This can happen when importing from a datastore that 

759 # has had some datasets ingested using "direct" mode. 

760 # Allow ResourcePath to sort it out, but warn about it since 

761 # the files that live outside this datastore will not be 

762 # copied into the target datastore. 

763 log.warning( 

764 "Some datasets are inside the datastore and some are outside. Using 'split' " 

765 "transfer mode. This assumes that the files outside the datastore are " 

766 "still accessible to the new butler since they will not be copied into " 

767 "the target datastore." 

768 ) 

769 transfer = "split" 

770 

771 return transfer 

772 

773 def _pathInStore(self, path: ResourcePathExpression) -> str | None: 

774 """Return path relative to datastore root. 

775 

776 Parameters 

777 ---------- 

778 path : `lsst.resources.ResourcePathExpression` 

779 Path to dataset. Can be an absolute URI. If relative, it is 

780 assumed to be relative to the datastore root. Paths outside 

781 the root are reported via the return value (see below). 

782 

783 Returns 

784 ------- 

785 inStore : `str` 

786 Path relative to datastore root. Returns `None` if the file is 

787 outside the root. 

788 """ 

789 # Relative path will always be relative to datastore 

790 pathUri = ResourcePath(path, forceAbsolute=False) 

791 return pathUri.relative_to(self.root) 

792 
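The underlying check is `ResourcePath.relative_to`, which returns `None` for locations outside the root; a minimal sketch:

```python
from lsst.resources import ResourcePath

root = ResourcePath("file:///repo/datastore/", forceDirectory=True)
inside = ResourcePath("file:///repo/datastore/raw/file.fits")
outside = ResourcePath("file:///elsewhere/file.fits")

print(inside.relative_to(root))   # "raw/file.fits"
print(outside.relative_to(root))  # None -> not in this datastore
```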

793 def _standardizeIngestPath( 

794 self, path: str | ResourcePath, *, transfer: str | None = None 

795 ) -> str | ResourcePath: 

796 """Standardize the path of a to-be-ingested file. 

797 

798 Parameters 

799 ---------- 

800 path : `str` or `lsst.resources.ResourcePath` 

801 Path of a file to be ingested. This parameter is not expected 

802 to cover all the types that can be used to construct a 

803 `~lsst.resources.ResourcePath`. 

804 transfer : `str`, optional 

805 How (and whether) the dataset should be added to the datastore. 

806 See `ingest` for details of transfer modes. 

807 This implementation is provided only so 

808 `NotImplementedError` can be raised if the mode is not supported; 

809 actual transfers are deferred to `_extractIngestInfo`. 

810 

811 Returns 

812 ------- 

813 path : `str` or `lsst.resources.ResourcePath` 

814 New path in what the datastore considers standard form. If an 

815 absolute URI was given that will be returned unchanged. 

816 

817 Notes 

818 ----- 

819 Subclasses of `FileDatastore` can implement this method instead 

820 of `_prepIngest`. It should not modify the data repository or given 

821 file in any way. 

822 

823 Raises 

824 ------ 

825 NotImplementedError 

826 Raised if the datastore does not support the given transfer mode 

827 (including the case where ingest is not supported at all). 

828 FileNotFoundError 

829 Raised if one of the given files does not exist. 

830 """ 

831 if transfer not in (None, "direct", "split") + self.root.transferModes: 

832 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

833 

834 # A relative URI indicates relative to datastore root 

835 srcUri = ResourcePath(path, forceAbsolute=False) 

836 if not srcUri.isabs(): 

837 srcUri = self.root.join(path) 

838 

839 if not srcUri.exists(): 

840 raise FileNotFoundError( 

841 f"Resource at {srcUri} does not exist; note that paths to ingest " 

842 f"are assumed to be relative to {self.root} unless they are absolute." 

843 ) 

844 

845 if transfer is None: 

846 relpath = srcUri.relative_to(self.root) 

847 if not relpath: 

848 raise RuntimeError( 

849 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

850 ) 

851 

852 # Return the relative path within the datastore for internal 

853 # transfer 

854 path = relpath 

855 

856 return path 

857 

858 def _extractIngestInfo( 

859 self, 

860 path: ResourcePathExpression, 

861 ref: DatasetRef, 

862 *, 

863 formatter: Formatter | type[Formatter], 

864 transfer: str | None = None, 

865 record_validation_info: bool = True, 

866 ) -> StoredFileInfo: 

867 """Relocate (if necessary) and extract `StoredFileInfo` from a 

868 to-be-ingested file. 

869 

870 Parameters 

871 ---------- 

872 path : `lsst.resources.ResourcePathExpression` 

873 URI or path of a file to be ingested. 

874 ref : `DatasetRef` 

875 Reference for the dataset being ingested. Guaranteed to have 

876 ``dataset_id not None``. 

877 formatter : `type` or `Formatter` 

878 `Formatter` subclass to use for this dataset or an instance. 

879 transfer : `str`, optional 

880 How (and whether) the dataset should be added to the datastore. 

881 See `ingest` for details of transfer modes. 

882 record_validation_info : `bool`, optional 

883 If `True`, the default, the datastore can record validation 

884 information associated with the file. If `False` the datastore 

885 will not attempt to track any information such as checksums 

886 or file sizes. This can be useful if such information is tracked 

887 in an external system or if the file is to be compressed in place. 

888 It is up to the datastore whether this parameter is relevant. 

889 

890 Returns 

891 ------- 

892 info : `StoredFileInfo` 

893 Internal datastore record for this file. This will be inserted by 

894 the caller; the `_extractIngestInfo` is only responsible for 

895 creating and populating the struct. 

896 

897 Raises 

898 ------ 

899 FileNotFoundError 

900 Raised if one of the given files does not exist. 

901 FileExistsError 

902 Raised if transfer is not `None` but the (internal) location the 

903 file would be moved to is already occupied. 

904 """ 

905 if self._transaction is None: 

906 raise RuntimeError("Ingest called without transaction enabled") 

907 

908 # Create URI of the source path, do not need to force a relative 

909 # path to absolute. 

910 srcUri = ResourcePath(path, forceAbsolute=False) 

911 

912 # Track whether we have read the size of the source yet 

913 have_sized = False 

914 

915 tgtLocation: Location | None 

916 if transfer is None or transfer == "split": 

917 # A relative path is assumed to be relative to the datastore 

918 # in this context 

919 if not srcUri.isabs(): 

920 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

921 else: 

922 # Work out the path in the datastore from an absolute URI 

923 # This is required to be within the datastore. 

924 pathInStore = srcUri.relative_to(self.root) 

925 if pathInStore is None and transfer is None: 

926 raise RuntimeError( 

927 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

928 ) 

929 if pathInStore: 

930 tgtLocation = self.locationFactory.fromPath(pathInStore) 

931 elif transfer == "split": 

932 # Outside the datastore but treat that as a direct ingest 

933 # instead. 

934 tgtLocation = None 

935 else: 

936 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

937 elif transfer == "direct": 

938 # Want to store the full URI to the resource directly in 

939 # datastore. This is useful for referring to permanent archive 

940 # storage for raw data. 

941 # Trust that people know what they are doing. 

942 tgtLocation = None 

943 else: 

944 # Work out the name we want this ingested file to have 

945 # inside the datastore 

946 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

947 if not tgtLocation.uri.dirname().exists(): 

948 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

949 tgtLocation.uri.dirname().mkdir() 

950 

951 # if we are transferring from a local file to a remote location 

952 # it may be more efficient to get the size and checksum of the 

953 # local file rather than the transferred one 

954 if record_validation_info and srcUri.isLocal: 

955 size = srcUri.size() 

956 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

957 have_sized = True 

958 

959 # Transfer the resource to the destination. 

960 # Allow overwrite of an existing file. This matches the behavior 

961 # of datastore.put() in that it trusts that registry would not 

962 # be asking to overwrite unless registry thought that the 

963 # overwrite was allowed. 

964 tgtLocation.uri.transfer_from( 

965 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

966 ) 

967 

968 if tgtLocation is None: 

969 # This means we are using direct mode 

970 targetUri = srcUri 

971 targetPath = str(srcUri) 

972 else: 

973 targetUri = tgtLocation.uri 

974 targetPath = tgtLocation.pathInStore.path 

975 

976 # the file should exist in the datastore now 

977 if record_validation_info: 

978 if not have_sized: 

979 size = targetUri.size() 

980 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

981 else: 

982 # Not recording any file information. 

983 size = -1 

984 checksum = None 

985 

986 return StoredFileInfo( 

987 formatter=formatter, 

988 path=targetPath, 

989 storageClass=ref.datasetType.storageClass, 

990 component=ref.datasetType.component(), 

991 file_size=size, 

992 checksum=checksum, 

993 dataset_id=ref.id, 

994 ) 

995 

996 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> _IngestPrepData: 

997 # Docstring inherited from Datastore._prepIngest. 

998 filtered = [] 

999 for dataset in datasets: 

1000 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1001 if not acceptable: 

1002 continue 

1003 else: 

1004 dataset.refs = acceptable 

1005 if dataset.formatter is None: 

1006 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1007 else: 

1008 assert isinstance(dataset.formatter, (type, str)) 

1009 formatter_class = get_class_of(dataset.formatter) 

1010 if not issubclass(formatter_class, Formatter): 

1011 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1012 dataset.formatter = formatter_class 

1013 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1014 filtered.append(dataset) 

1015 return _IngestPrepData(filtered) 

1016 

1017 @transactional 

1018 def _finishIngest( 

1019 self, 

1020 prepData: Datastore.IngestPrepData, 

1021 *, 

1022 transfer: str | None = None, 

1023 record_validation_info: bool = True, 

1024 ) -> None: 

1025 # Docstring inherited from Datastore._finishIngest. 

1026 refsAndInfos = [] 

1027 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1028 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1029 # Do ingest as if the first dataset ref is associated with the file 

1030 info = self._extractIngestInfo( 

1031 dataset.path, 

1032 dataset.refs[0], 

1033 formatter=dataset.formatter, 

1034 transfer=transfer, 

1035 record_validation_info=record_validation_info, 

1036 ) 

1037 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1038 self._register_datasets(refsAndInfos) 

1039 
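For context, ingest is normally driven through the public `Butler` API rather than these private hooks; a hedged usage sketch, assuming an existing ``butler`` and a resolved `DatasetRef` ``ref``:

```python
from lsst.daf.butler import FileDataset

# `butler` and `ref` are assumed to exist already.
dataset = FileDataset(path="/data/staging/exposure.fits", refs=[ref])

# transfer="auto" lets _overrideTransferMode() choose between None,
# "auto" and "split" depending on whether the file is already inside
# the datastore root.
butler.ingest(dataset, transfer="auto")
```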

1040 def _calculate_ingested_datastore_name( 

1041 self, 

1042 srcUri: ResourcePath, 

1043 ref: DatasetRef, 

1044 formatter: Formatter | type[Formatter] | None = None, 

1045 ) -> Location: 

1046 """Given a source URI and a DatasetRef, determine the name the 

1047 dataset will have inside datastore. 

1048 

1049 Parameters 

1050 ---------- 

1051 srcUri : `lsst.resources.ResourcePath` 

1052 URI to the source dataset file. 

1053 ref : `DatasetRef` 

1054 Ref associated with the newly-ingested dataset artifact. This 

1055 is used to determine the name within the datastore. 

1056 formatter : `Formatter` or `Formatter` class, optional 

1057 Formatter to use for validation. Can be a class or an instance. 

1058 No validation of the file extension is performed if the 

1059 ``formatter`` is `None`. This can be used if the caller knows 

1060 that the source URI and target URI will use the same formatter. 

1061 

1062 Returns 

1063 ------- 

1064 location : `Location` 

1065 Target location for the newly-ingested dataset. 

1066 """ 

1067 # Ingesting a file from outside the datastore. 

1068 # This involves a new name. 

1069 template = self.templates.getTemplate(ref) 

1070 location = self.locationFactory.fromPath(template.format(ref)) 

1071 

1072 # Get the extension 

1073 ext = srcUri.getExtension() 

1074 

1075 # Update the destination to include that extension 

1076 location.updateExtension(ext) 

1077 

1078 # Ask the formatter to validate this extension 

1079 if formatter is not None: 

1080 formatter.validateExtension(location) 

1081 

1082 return location 

1083 

1084 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1085 """Write out in memory dataset to datastore. 

1086 

1087 Parameters 

1088 ---------- 

1089 inMemoryDataset : `object` 

1090 Dataset to write to datastore. 

1091 ref : `DatasetRef` 

1092 Registry information associated with this dataset. 

1093 

1094 Returns 

1095 ------- 

1096 info : `StoredFileInfo` 

1097 Information describing the artifact written to the datastore. 

1098 """ 

1099 # May need to coerce the in memory dataset to the correct 

1100 # python type, but first we need to make sure the storage class 

1101 # reflects the one defined in the data repository. 

1102 ref = self._cast_storage_class(ref) 

1103 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1104 

1105 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1106 uri = location.uri 

1107 

1108 if not uri.dirname().exists(): 

1109 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1110 uri.dirname().mkdir() 

1111 

1112 if self._transaction is None: 

1113 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1114 

1115 def _removeFileExists(uri: ResourcePath) -> None: 

1116 """Remove a file and do not complain if it is not there. 

1117 

1118 This is important since a formatter might fail before the file 

1119 is written and we should not confuse people by writing spurious 

1120 error messages to the log. 

1121 """ 

1122 try: 

1123 uri.remove() 

1124 except FileNotFoundError: 

1125 pass 

1126 

1127 # Register a callback to try to delete the uploaded data if 

1128 # something fails below 

1129 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1130 

1131 data_written = False 

1132 if not uri.isLocal: 

1133 # This is a remote URI. Some datasets can be serialized directly 

1134 # to bytes and sent to the remote datastore without writing a 

1135 # file. If the dataset is intended to be saved to the cache 

1136 # a file is always written and direct write to the remote 

1137 # datastore is bypassed. 

1138 if not self.cacheManager.should_be_cached(ref): 

1139 try: 

1140 serializedDataset = formatter.toBytes(inMemoryDataset) 

1141 except NotImplementedError: 

1142 # Fallback to the file writing option. 

1143 pass 

1144 except Exception as e: 

1145 raise RuntimeError( 

1146 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1147 ) from e 

1148 else: 

1149 log.debug("Writing bytes directly to %s", uri) 

1150 uri.write(serializedDataset, overwrite=True) 

1151 log.debug("Successfully wrote bytes directly to %s", uri) 

1152 data_written = True 

1153 

1154 if not data_written: 

1155 # Did not write the bytes directly to object store so instead 

1156 # write to temporary file. Always write to a temporary even if 

1157 # using a local file system -- that gives us atomic writes. 

1158 # If a process is killed as the file is being written we do not 

1159 # want it to remain in the correct place but in a corrupt state. 

1160 # For local files write to the output directory not temporary dir. 

1161 prefix = uri.dirname() if uri.isLocal else None 

1162 with ResourcePath.temporary_uri(suffix=uri.getExtension(), prefix=prefix) as temporary_uri: 

1163 # Need to configure the formatter to write to a different 

1164 # location and that needs us to overwrite internals 

1165 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1166 with formatter._updateLocation(Location(None, temporary_uri)): 

1167 try: 

1168 formatter.write(inMemoryDataset) 

1169 except Exception as e: 

1170 raise RuntimeError( 

1171 f"Failed to serialize dataset {ref} of type" 

1172 f" {type(inMemoryDataset)} to " 

1173 f"temporary location {temporary_uri}" 

1174 ) from e 

1175 

1176 # Use move for a local file since that becomes an efficient 

1177 # os.rename. For remote resources we use copy to allow the 

1178 # file to be cached afterwards. 

1179 transfer = "move" if uri.isLocal else "copy" 

1180 

1181 uri.transfer_from(temporary_uri, transfer=transfer, overwrite=True) 

1182 

1183 if transfer == "copy": 

1184 # Cache if required 

1185 self.cacheManager.move_to_cache(temporary_uri, ref) 

1186 

1187 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1188 

1189 # The URI is needed to resolve which ingest case we are dealing with. 

1190 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1191 
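The write-to-temporary-then-transfer pattern used above can be sketched in isolation with `lsst.resources` alone (the destination path is illustrative):

```python
from lsst.resources import ResourcePath

destination = ResourcePath("file:///repo/datastore/example/output.json")
destination.dirname().mkdir()

# Write to a temporary URI first, then move it into place, so a crash
# mid-write never leaves a partially written file at the final location.
with ResourcePath.temporary_uri(prefix=destination.dirname(), suffix=".json") as tmp:
    tmp.write(b'{"value": 42}', overwrite=True)
    destination.transfer_from(tmp, transfer="move", overwrite=True)
```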

1192 def _read_artifact_into_memory( 

1193 self, 

1194 getInfo: DatastoreFileGetInformation, 

1195 ref: DatasetRef, 

1196 isComponent: bool = False, 

1197 cache_ref: DatasetRef | None = None, 

1198 ) -> Any: 

1199 """Read the artifact from datastore into in memory object. 

1200 

1201 Parameters 

1202 ---------- 

1203 getInfo : `DatastoreFileGetInformation` 

1204 Information about the artifact within the datastore. 

1205 ref : `DatasetRef` 

1206 The registry information associated with this artifact. 

1207 isComponent : `bool` 

1208 Flag to indicate if a component is being read from this artifact. 

1209 cache_ref : `DatasetRef`, optional 

1210 The DatasetRef to use when looking up the file in the cache. 

1211 This ref must have the same ID as the supplied ref but can 

1212 be a parent ref or component ref to indicate to the cache whether 

1213 a composite file is being requested from the cache or a component 

1214 file. Without this the cache will default to the supplied ref but 

1215 it can get confused with read-only derived components for 

1216 disassembled composites. 

1217 

1218 Returns 

1219 ------- 

1220 inMemoryDataset : `object` 

1221 The artifact as a python object. 

1222 """ 

1223 location = getInfo.location 

1224 uri = location.uri 

1225 log.debug("Accessing data from %s", uri) 

1226 

1227 if cache_ref is None: 

1228 cache_ref = ref 

1229 if cache_ref.id != ref.id: 

1230 raise ValueError( 

1231 "The supplied cache dataset ref refers to a different dataset than expected:" 

1232 f" {ref.id} != {cache_ref.id}" 

1233 ) 

1234 

1235 # Cannot recalculate checksum but can compare size as a quick check 

1236 # Do not do this if the size is negative since that indicates 

1237 # we do not know. 

1238 recorded_size = getInfo.info.file_size 

1239 resource_size = uri.size() 

1240 if recorded_size >= 0 and resource_size != recorded_size: 

1241 raise RuntimeError( 

1242 "Integrity failure in Datastore. " 

1243 f"Size of file {uri} ({resource_size}) " 

1244 f"does not match size recorded in registry of {recorded_size}" 

1245 ) 

1246 

1247 # For the general case we have choices for how to proceed. 

1248 # 1. Always use a local file (downloading the remote resource to a 

1249 # temporary file if needed). 

1250 # 2. Use a threshold size and read into memory and use bytes. 

1251 # Use both for now with an arbitrary hand-off size. 

1252 # This allows small datasets to be downloaded from remote object 

1253 # stores without requiring a temporary file. 

1254 

1255 formatter = getInfo.formatter 

1256 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1257 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1258 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1259 if cached_file is not None: 

1260 desired_uri = cached_file 

1261 msg = f" (cached version of {uri})" 

1262 else: 

1263 desired_uri = uri 

1264 msg = "" 

1265 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1266 serializedDataset = desired_uri.read() 

1267 log.debug( 

1268 "Deserializing %s from %d bytes from location %s with formatter %s", 

1269 f"component {getInfo.component}" if isComponent else "", 

1270 len(serializedDataset), 

1271 uri, 

1272 formatter.name(), 

1273 ) 

1274 try: 

1275 result = formatter.fromBytes( 

1276 serializedDataset, component=getInfo.component if isComponent else None 

1277 ) 

1278 except Exception as e: 

1279 raise ValueError( 

1280 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1281 f" ({ref.datasetType.name} from {uri}): {e}" 

1282 ) from e 

1283 else: 

1284 # Read from file. 

1285 

1286 # Have to update the Location associated with the formatter 

1287 # because formatter.read does not allow an override. 

1288 # This could be improved. 

1289 location_updated = False 

1290 msg = "" 

1291 

1292 # First check in cache for local version. 

1293 # The cache will only be relevant for remote resources but 

1294 # no harm in always asking. Context manager ensures that cache 

1295 # file is not deleted during cache expiration. 

1296 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1297 if cached_file is not None: 

1298 msg = f"(via cache read of remote file {uri})" 

1299 uri = cached_file 

1300 location_updated = True 

1301 

1302 with uri.as_local() as local_uri: 

1303 can_be_cached = False 

1304 if uri != local_uri: 

1305 # URI was remote and file was downloaded 

1306 cache_msg = "" 

1307 location_updated = True 

1308 

1309 if self.cacheManager.should_be_cached(cache_ref): 

1310 # In this scenario we want to ask if the downloaded 

1311 # file should be cached but we should not cache 

1312 # it until after we've used it (to ensure it can't 

1313 # be expired whilst we are using it). 

1314 can_be_cached = True 

1315 

1316 # Say that it is "likely" to be cached because 

1317 # if the formatter read fails we will not be 

1318 # caching this file. 

1319 cache_msg = " and likely cached" 

1320 

1321 msg = f"(via download to local file{cache_msg})" 

1322 

1323 # Calculate the (possibly) new location for the formatter 

1324 # to use. 

1325 newLocation = Location(*local_uri.split()) if location_updated else None 

1326 

1327 log.debug( 

1328 "Reading%s from location %s %s with formatter %s", 

1329 f" component {getInfo.component}" if isComponent else "", 

1330 uri, 

1331 msg, 

1332 formatter.name(), 

1333 ) 

1334 try: 

1335 with formatter._updateLocation(newLocation): 

1336 with time_this( 

1337 log, 

1338 msg="Reading%s from location %s %s with formatter %s", 

1339 args=( 

1340 f" component {getInfo.component}" if isComponent else "", 

1341 uri, 

1342 msg, 

1343 formatter.name(), 

1344 ), 

1345 ): 

1346 result = formatter.read(component=getInfo.component if isComponent else None) 

1347 except Exception as e: 

1348 raise ValueError( 

1349 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1350 f" ({ref.datasetType.name} from {uri}): {e}" 

1351 ) from e 

1352 

1353 # File was read successfully so can move to cache 

1354 if can_be_cached: 

1355 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1356 

1357 return self._post_process_get( 

1358 result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent 

1359 ) 

1360 
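The remote-read branch above leans on `ResourcePath.as_local`; in isolation the pattern looks like this (the S3 URI and ``load_file`` are placeholders):

```python
from lsst.resources import ResourcePath

uri = ResourcePath("s3://some-bucket/data/example.fits")  # placeholder URI

# as_local() yields the file itself when it is already local, or a
# temporary download (removed on exit) when the resource is remote.
with uri.as_local() as local_uri:
    payload = load_file(local_uri.ospath)  # placeholder for formatter.read()
```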

1361 def knows(self, ref: DatasetRef) -> bool: 

1362 """Check if the dataset is known to the datastore. 

1363 

1364 Does not check for existence of any artifact. 

1365 

1366 Parameters 

1367 ---------- 

1368 ref : `DatasetRef` 

1369 Reference to the required dataset. 

1370 

1371 Returns 

1372 ------- 

1373 exists : `bool` 

1374 `True` if the dataset is known to the datastore. 

1375 """ 

1376 fileLocations = self._get_dataset_locations_info(ref) 

1377 if fileLocations: 

1378 return True 

1379 return False 

1380 

1381 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

1382 # Docstring inherited from the base class. 

1383 

1384 # The records themselves. Could be missing some entries. 

1385 records = self._get_stored_records_associated_with_refs(refs) 

1386 

1387 return {ref: ref.id in records for ref in refs} 

1388 

1389 def _process_mexists_records( 

1390 self, 

1391 id_to_ref: dict[DatasetId, DatasetRef], 

1392 records: dict[DatasetId, list[StoredFileInfo]], 

1393 all_required: bool, 

1394 artifact_existence: dict[ResourcePath, bool] | None = None, 

1395 ) -> dict[DatasetRef, bool]: 

1396 """Check given records for existence. 

1397 

1398 Helper function for `mexists()`. 

1399 

1400 Parameters 

1401 ---------- 

1402 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1403 Mapping of the dataset ID to the dataset ref itself. 

1404 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1405 Records as generally returned by 

1406 ``_get_stored_records_associated_with_refs``. 

1407 all_required : `bool` 

1408 If `True`, every artifact associated with a dataset ID must exist 

1409 for that dataset to be reported as existing. 

1410 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1411 Optional mapping of datastore artifact to existence. Updated by 

1412 this method with details of all artifacts tested. Can be `None` 

1413 if the caller is not interested. 

1414 

1415 Returns 

1416 ------- 

1417 existence : `dict` of [`DatasetRef`, `bool`] 

1418 Mapping from dataset to boolean indicating existence. 

1419 """ 

1420 # The URIs to be checked and a mapping of those URIs to 

1421 # the dataset ID. 

1422 uris_to_check: list[ResourcePath] = [] 

1423 location_map: dict[ResourcePath, DatasetId] = {} 

1424 

1425 location_factory = self.locationFactory 

1426 

1427 uri_existence: dict[ResourcePath, bool] = {} 

1428 for ref_id, infos in records.items(): 

1429 # Key is the dataset ID, value is a list of StoredFileInfo 

1430 uris = [info.file_location(location_factory).uri for info in infos] 

1431 location_map.update({uri: ref_id for uri in uris}) 

1432 

1433 # Check the local cache directly for a dataset corresponding 

1434 # to the remote URI. 

1435 if self.cacheManager.file_count > 0: 

1436 ref = id_to_ref[ref_id] 

1437 for uri, storedFileInfo in zip(uris, infos): 

1438 check_ref = ref 

1439 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1440 check_ref = ref.makeComponentRef(component) 

1441 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1442 # Proxy for URI existence. 

1443 uri_existence[uri] = True 

1444 else: 

1445 uris_to_check.append(uri) 

1446 else: 

1447 # Check all of them. 

1448 uris_to_check.extend(uris) 

1449 

1450 if artifact_existence is not None: 

1451 # If a URI has already been checked remove it from the list 

1452 # and immediately add the status to the output dict. 

1453 filtered_uris_to_check = [] 

1454 for uri in uris_to_check: 

1455 if uri in artifact_existence: 

1456 uri_existence[uri] = artifact_existence[uri] 

1457 else: 

1458 filtered_uris_to_check.append(uri) 

1459 uris_to_check = filtered_uris_to_check 

1460 

1461 # Results. 

1462 dataset_existence: dict[DatasetRef, bool] = {} 

1463 

1464 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1465 for uri, exists in uri_existence.items(): 

1466 dataset_id = location_map[uri] 

1467 ref = id_to_ref[dataset_id] 

1468 

1469 # Disassembled composite needs to check all locations. 

1470 # all_required indicates whether all need to exist or not. 

1471 if ref in dataset_existence: 

1472 if all_required: 

1473 exists = dataset_existence[ref] and exists 

1474 else: 

1475 exists = dataset_existence[ref] or exists 

1476 dataset_existence[ref] = exists 

1477 

1478 if artifact_existence is not None: 

1479 artifact_existence.update(uri_existence) 

1480 

1481 return dataset_existence 

1482 

1483 def mexists( 

1484 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1485 ) -> dict[DatasetRef, bool]: 

1486 """Check the existence of multiple datasets at once. 

1487 

1488 Parameters 

1489 ---------- 

1490 refs : iterable of `DatasetRef` 

1491 The datasets to be checked. 

1492 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1493 Optional mapping of datastore artifact to existence. Updated by 

1494 this method with details of all artifacts tested. Can be `None` 

1495 if the caller is not interested. 

1496 

1497 Returns 

1498 ------- 

1499 existence : `dict` of [`DatasetRef`, `bool`] 

1500 Mapping from dataset to boolean indicating existence. 

1501 

1502 Notes 

1503 ----- 

1504 To minimize potentially costly remote existence checks, the local 

1505 cache is checked as a proxy for existence. If a file for this 

1506 `DatasetRef` does exist no check is done for the actual URI. This 

1507 could result in possibly unexpected behavior if the dataset itself 

1508 has been removed from the datastore by another process whilst it is 

1509 still in the cache. 

1510 """ 

1511 chunk_size = 10_000 

1512 dataset_existence: dict[DatasetRef, bool] = {} 

1513 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1514 n_found_total = 0 

1515 n_checked = 0 

1516 n_chunks = 0 

1517 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1518 chunk_result = self._mexists(chunk, artifact_existence) 

1519 

1520 # The log message level and content depend on how many 

1521 # datasets we are processing. 

1522 n_results = len(chunk_result) 

1523 

1524 # Use verbose logging to ensure that messages can be seen 

1525 # easily if many refs are being checked. 

1526 log_threshold = VERBOSE 

1527 n_checked += n_results 

1528 

1529 # This sum can take some time so only do it if we know the 

1530 # result is going to be used. 

1531 n_found = 0 

1532 if log.isEnabledFor(log_threshold): 

1533 # Can treat the booleans as 0, 1 integers and sum them. 

1534 n_found = sum(chunk_result.values()) 

1535 n_found_total += n_found 

1536 

1537 # We are deliberately not trying to count the number of refs 

1538 # provided in case it's in the millions. This means there is a 

1539 # situation where the number of refs exactly matches the chunk 

1540 # size and we will switch to the multi-chunk path even though 

1541 # we only have a single chunk. 

1542 if n_results < chunk_size and n_chunks == 0: 

1543 # Single chunk will be processed so we can provide more detail. 

1544 if n_results == 1: 

1545 ref = next(iter(chunk_result)) 

1546 # Use debug logging to be consistent with `exists()`. 

1547 log.debug( 

1548 "Calling mexists() with single ref that does%s exist (%s).", 

1549 "" if chunk_result[ref] else " not", 

1550 ref, 

1551 ) 

1552 else: 

1553 # Single chunk but multiple files. Summarize. 

1554 log.log( 

1555 log_threshold, 

1556 "Number of datasets found in datastore: %d out of %d datasets checked.", 

1557 n_found, 

1558 n_checked, 

1559 ) 

1560 

1561 else: 

1562 # Use incremental verbose logging when we have multiple chunks. 

1563 log.log( 

1564 log_threshold, 

1565 "Number of datasets found in datastore for chunk %d: %d out of %d checked " 

1566 "(running total from all chunks so far: %d found out of %d checked)", 

1567 n_chunks, 

1568 n_found, 

1569 n_results, 

1570 n_found_total, 

1571 n_checked, 

1572 ) 

1573 dataset_existence.update(chunk_result) 

1574 n_chunks += 1 

1575 

1576 return dataset_existence 

1577 
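
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper showing how mexists() can be used to find refs
# whose file artifacts are missing; the helper name, the ``datastore``
# argument and the choice to share ``artifact_existence`` are assumptions
# of this sketch. It relies on names already imported at module scope.
def _example_find_missing(datastore: FileDatastore, refs: Iterable[DatasetRef]) -> list[DatasetRef]:
    """Return the refs whose file artifacts could not be found (sketch)."""
    artifact_existence: dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence)
    # artifact_existence now caches every URI that was checked and can be
    # handed to later bulk calls (e.g. transfer_from) to avoid re-checking.
    return [ref for ref, found in existence.items() if not found]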

1578 def _mexists( 

1579 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1580 ) -> dict[DatasetRef, bool]: 

1581 """Check the existence of multiple datasets at once. 

1582 

1583 Parameters 

1584 ---------- 

1585 refs : iterable of `DatasetRef` 

1586 The datasets to be checked. 

1587 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1588 Optional mapping of datastore artifact to existence. Updated by 

1589 this method with details of all artifacts tested. Can be `None` 

1590 if the caller is not interested. 

1591 

1592 Returns 

1593 ------- 

1594 existence : `dict` of [`DatasetRef`, `bool`] 

1595 Mapping from dataset to boolean indicating existence. 

1596 """ 

1597 # Make a mapping from refs with the internal storage class to the given 

1598 # refs that may have a different one. We'll use the internal refs 

1599 # throughout this method and convert back at the very end. 

1600 internal_ref_to_input_ref = {self._cast_storage_class(ref): ref for ref in refs} 

1601 

1602 # Need a mapping of dataset_id to (internal) dataset ref since some 

1603 # internal APIs work with dataset_id. 

1604 id_to_ref = {ref.id: ref for ref in internal_ref_to_input_ref} 

1605 

1606 # Set of all IDs we are checking for. 

1607 requested_ids = set(id_to_ref.keys()) 

1608 

1609 # The records themselves. Could be missing some entries. 

1610 records = self._get_stored_records_associated_with_refs(id_to_ref.values()) 

1611 

1612 dataset_existence = self._process_mexists_records( 

1613 id_to_ref, records, True, artifact_existence=artifact_existence 

1614 ) 

1615 

1616 # Set of IDs that have been handled. 

1617 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1618 

1619 missing_ids = requested_ids - handled_ids 

1620 if missing_ids: 

1621 dataset_existence.update( 

1622 self._mexists_check_expected( 

1623 [id_to_ref[missing] for missing in missing_ids], artifact_existence 

1624 ) 

1625 ) 

1626 

1627 return { 

1628 internal_ref_to_input_ref[internal_ref]: existence 

1629 for internal_ref, existence in dataset_existence.items() 

1630 } 

1631 

1632 def _mexists_check_expected( 

1633 self, refs: Sequence[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

1634 ) -> dict[DatasetRef, bool]: 

1635 """Check existence of refs that are not known to datastore. 

1636 

1637 Parameters 

1638 ---------- 

1639 refs : iterable of `DatasetRef` 

1640 The datasets to be checked. These are assumed not to be known 

1641 to datastore. 

1642 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1643 Optional mapping of datastore artifact to existence. Updated by 

1644 this method with details of all artifacts tested. Can be `None` 

1645 if the caller is not interested. 

1646 

1647 Returns 

1648 ------- 

1649 existence : `dict` of [`DatasetRef`, `bool`] 

1650 Mapping from dataset to boolean indicating existence. 

1651 """ 

1652 dataset_existence: dict[DatasetRef, bool] = {} 

1653 if not self.trustGetRequest: 

1654 # Must assume these do not exist 

1655 for ref in refs: 

1656 dataset_existence[ref] = False 

1657 else: 

1658 log.debug( 

1659 "%d datasets were not known to datastore during initial existence check.", 

1660 len(refs), 

1661 ) 

1662 

1663 # Construct data structure identical to that returned 

1664 # by _get_stored_records_associated_with_refs() but using 

1665 # guessed names. 

1666 records = {} 

1667 id_to_ref = {} 

1668 for missing_ref in refs: 

1669 expected = self._get_expected_dataset_locations_info(missing_ref) 

1670 dataset_id = missing_ref.id 

1671 records[dataset_id] = [info for _, info in expected] 

1672 id_to_ref[dataset_id] = missing_ref 

1673 

1674 dataset_existence.update( 

1675 self._process_mexists_records( 

1676 id_to_ref, 

1677 records, 

1678 False, 

1679 artifact_existence=artifact_existence, 

1680 ) 

1681 ) 

1682 

1683 return dataset_existence 

1684 

1685 def exists(self, ref: DatasetRef) -> bool: 

1686 """Check if the dataset exists in the datastore. 

1687 

1688 Parameters 

1689 ---------- 

1690 ref : `DatasetRef` 

1691 Reference to the required dataset. 

1692 

1693 Returns 

1694 ------- 

1695 exists : `bool` 

1696 `True` if the entity exists in the `Datastore`. 

1697 

1698 Notes 

1699 ----- 

1700 The local cache is checked as a proxy for existence in the remote 

1701 object store. It is possible that another process on a different 

1702 compute node could remove the file from the object store even 

1703 though it is present in the local cache. 

1704 """ 

1705 ref = self._cast_storage_class(ref) 

1706 fileLocations = self._get_dataset_locations_info(ref) 

1707 

1708 # if we are being asked to trust that registry might not be correct 

1709 # we ask for the expected locations and check them explicitly 

1710 if not fileLocations: 

1711 if not self.trustGetRequest: 

1712 return False 

1713 

1714 # First check the cache. If it is not found we must check 

1715 # the datastore itself. Assume that any component in the cache 

1716 # means that the dataset does exist somewhere. 

1717 if self.cacheManager.known_to_cache(ref): 

1718 return True 

1719 

1720 # When we are guessing a dataset location we can not check 

1721 # for the existence of every component since we can not 

1722 # know if every component was written. Instead we check 

1723 # for the existence of any of the expected locations. 

1724 for location, _ in self._get_expected_dataset_locations_info(ref): 

1725 if self._artifact_exists(location): 

1726 return True 

1727 return False 

1728 

1729 # All listed artifacts must exist. 

1730 for location, storedFileInfo in fileLocations: 

1731 # Checking in cache needs the component ref. 

1732 check_ref = ref 

1733 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1734 check_ref = ref.makeComponentRef(component) 

1735 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1736 continue 

1737 

1738 if not self._artifact_exists(location): 

1739 return False 

1740 

1741 return True 

1742 

1743 def getURIs(self, ref: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

1744 """Return URIs associated with dataset. 

1745 

1746 Parameters 

1747 ---------- 

1748 ref : `DatasetRef` 

1749 Reference to the required dataset. 

1750 predict : `bool`, optional 

1751 If the datastore does not know about the dataset, should it 

1752 return a predicted URI or not? 

1753 

1754 Returns 

1755 ------- 

1756 uris : `DatasetRefURIs` 

1757 The URI to the primary artifact associated with this dataset (if 

1758 the dataset was disassembled within the datastore this may be 

1759 `None`), and the URIs to any components associated with the dataset 

1760 artifact (can be empty if there are no components). 

1761 """ 

1762 many = self.getManyURIs([ref], predict=predict, allow_missing=False) 

1763 return many[ref] 

1764 

1765 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1766 """URI to the Dataset. 

1767 

1768 Parameters 

1769 ---------- 

1770 ref : `DatasetRef` 

1771 Reference to the required Dataset. 

1772 predict : `bool` 

1773 If `True`, allow URIs to be returned of datasets that have not 

1774 been written. 

1775 

1776 Returns 

1777 ------- 

1778 uri : `lsst.resources.ResourcePath` 

1779 URI pointing to the dataset within the datastore. If the 

1780 dataset does not exist in the datastore, and if ``predict`` is 

1781 `True`, the URI will be a prediction and will include a URI 

1782 fragment "#predicted". 

1783 If the datastore does not have entities that relate well 

1784 to the concept of a URI the returned URI will be 

1785 descriptive. The returned URI is not guaranteed to be obtainable. 

1786 

1787 Raises 

1788 ------ 

1789 FileNotFoundError 

1790 Raised if a URI has been requested for a dataset that does not 

1791 exist and guessing is not allowed. 

1792 RuntimeError 

1793 Raised if a request is made for a single URI but multiple URIs 

1794 are associated with this dataset. 

1795 

1796 Notes 

1797 ----- 

1798 When a predicted URI is requested an attempt will be made to form 

1799 a reasonable URI based on file templates and the expected formatter. 

1800 """ 

1801 primary, components = self.getURIs(ref, predict) 

1802 if primary is None or components: 

1803 raise RuntimeError( 

1804 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1805 ) 

1806 return primary 

1807 
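
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper showing why callers that may encounter disassembled
# composites should prefer getURIs() over getURI(): such datasets have no
# primary URI, only component URIs, so getURI() raises for them. The helper
# name and ``datastore`` argument are assumptions of this sketch.
def _example_all_uris(datastore: FileDatastore, ref: DatasetRef) -> list[ResourcePath]:
    """Collect every URI associated with ``ref`` (sketch)."""
    uris = datastore.getURIs(ref)
    collected = [] if uris.primaryURI is None else [uris.primaryURI]
    collected.extend(uris.componentURIs.values())
    return collected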

1808 def _predict_URIs( 

1809 self, 

1810 ref: DatasetRef, 

1811 ) -> DatasetRefURIs: 

1812 """Predict the URIs of a dataset ref. 

1813 

1814 Parameters 

1815 ---------- 

1816 ref : `DatasetRef` 

1817 Reference to the required Dataset. 

1818 

1819 Returns 

1820 ------- 

1821 uris : `DatasetRefURIs` 

1822 Primary and component URIs. URIs will contain a URI fragment 

1823 "#predicted". 

1824 """ 

1825 uris = DatasetRefURIs() 

1826 

1827 if self.composites.shouldBeDisassembled(ref): 

1828 for component, _ in ref.datasetType.storageClass.components.items(): 

1829 comp_ref = ref.makeComponentRef(component) 

1830 comp_location, _ = self._determine_put_formatter_location(comp_ref) 

1831 

1832 # Add the "#predicted" URI fragment to indicate this is a 

1833 # guess 

1834 uris.componentURIs[component] = ResourcePath(comp_location.uri.geturl() + "#predicted") 

1835 

1836 else: 

1837 location, _ = self._determine_put_formatter_location(ref) 

1838 

1839 # Add the "#predicted" URI fragment to indicate this is a guess 

1840 uris.primaryURI = ResourcePath(location.uri.geturl() + "#predicted") 

1841 

1842 return uris 

1843 

1844 def getManyURIs( 

1845 self, 

1846 refs: Iterable[DatasetRef], 

1847 predict: bool = False, 

1848 allow_missing: bool = False, 

1849 ) -> dict[DatasetRef, DatasetRefURIs]: 

1850 # Docstring inherited 

1851 

1852 uris: dict[DatasetRef, DatasetRefURIs] = {} 

1853 

1854 records = self._get_stored_records_associated_with_refs(refs) 

1855 records_keys = records.keys() 

1856 

1857 existing_refs = tuple(ref for ref in refs if ref.id in records_keys) 

1858 missing_refs = tuple(ref for ref in refs if ref.id not in records_keys) 

1859 

1860 # Have to handle trustGetRequest mode by checking for the existence 

1861 # of the missing refs on disk. 

1862 if missing_refs: 

1863 dataset_existence = self._mexists_check_expected(missing_refs, None) 

1864 really_missing = set() 

1865 not_missing = set() 

1866 for ref, exists in dataset_existence.items(): 

1867 if exists: 

1868 not_missing.add(ref) 

1869 else: 

1870 really_missing.add(ref) 

1871 

1872 if not_missing: 

1873 # Need to recalculate the missing/existing split. 

1874 existing_refs = existing_refs + tuple(not_missing) 

1875 missing_refs = tuple(really_missing) 

1876 

1877 for ref in missing_refs: 

1878 # if this has never been written then we have to guess 

1879 if not predict: 

1880 if not allow_missing: 

1881 raise FileNotFoundError(f"Dataset {ref} not in this datastore.") 

1882 else: 

1883 uris[ref] = self._predict_URIs(ref) 

1884 

1885 for ref in existing_refs: 

1886 file_infos = records[ref.id] 

1887 file_locations = [(i.file_location(self.locationFactory), i) for i in file_infos] 

1888 uris[ref] = self._locations_to_URI(ref, file_locations) 

1889 

1890 return uris 

1891 
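
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper around getManyURIs() with prediction enabled;
# predicted locations carry a "#predicted" URI fragment, which the sketch
# uses to separate written datasets from merely predicted ones. The helper
# name and ``datastore`` argument are assumptions of this sketch.
def _example_split_predicted(
    datastore: FileDatastore, refs: Iterable[DatasetRef]
) -> tuple[dict[DatasetRef, DatasetRefURIs], dict[DatasetRef, DatasetRefURIs]]:
    """Split refs into (written, predicted-only) URI mappings (sketch)."""
    written: dict[DatasetRef, DatasetRefURIs] = {}
    predicted: dict[DatasetRef, DatasetRefURIs] = {}
    for ref, uris in datastore.getManyURIs(refs, predict=True).items():
        if uris.primaryURI is not None:
            candidates = [uris.primaryURI]
        else:
            candidates = list(uris.componentURIs.values())
        if any(uri.geturl().endswith("#predicted") for uri in candidates):
            predicted[ref] = uris
        else:
            written[ref] = uris
    return written, predicted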

1892 def _locations_to_URI( 

1893 self, 

1894 ref: DatasetRef, 

1895 file_locations: Sequence[tuple[Location, StoredFileInfo]], 

1896 ) -> DatasetRefURIs: 

1897 """Convert one or more file locations associated with a DatasetRef 

1898 to a DatasetRefURIs. 

1899 

1900 Parameters 

1901 ---------- 

1902 ref : `DatasetRef` 

1903 Reference to the dataset. 

1904 file_locations : `Sequence` [`tuple` [`Location`, `StoredFileInfo`]] 

1905 Each item in the sequence is the location of the dataset within the 

1906 datastore and stored information about the file and its formatter. 

1907 If there is only one item in the sequence then it is treated as the 

1908 primary URI. If there is more than one item then they are treated 

1909 as component URIs. If there are no items then an error is raised 

1910 unless ``self.trustGetRequest`` is `True`. 

1911 

1912 Returns 

1913 ------- 

1914 uris : `DatasetRefURIs` 

1915 Represents the primary URI or component URIs described by the 

1916 inputs. 

1917 

1918 Raises 

1919 ------ 

1920 RuntimeError 

1921 If no file locations are passed in and ``self.trustGetRequest`` is 

1922 `False`. 

1923 FileNotFoundError 

1924 If a passed-in URI does not exist, and ``self.trustGetRequest`` 

1925 is `False`. 

1926 RuntimeError 

1927 If a passed in `StoredFileInfo`'s ``component`` is `None` (this is 

1928 unexpected). 

1929 """ 

1930 guessing = False 

1931 uris = DatasetRefURIs() 

1932 

1933 if not file_locations: 

1934 if not self.trustGetRequest: 

1935 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1936 file_locations = self._get_expected_dataset_locations_info(ref) 

1937 guessing = True 

1938 

1939 if len(file_locations) == 1: 

1940 # No disassembly so this is the primary URI 

1941 uris.primaryURI = file_locations[0][0].uri 

1942 if guessing and not uris.primaryURI.exists(): 

1943 raise FileNotFoundError(f"Expected URI ({uris.primaryURI}) does not exist") 

1944 else: 

1945 for location, file_info in file_locations: 

1946 if file_info.component is None: 

1947 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1948 if guessing and not location.uri.exists(): 

1949 # If we are trusting then it is entirely possible for 

1950 # some components to be missing. In that case we skip 

1951 # to the next component. 

1952 if self.trustGetRequest: 

1953 continue 

1954 raise FileNotFoundError(f"Expected URI ({location.uri}) does not exist") 

1955 uris.componentURIs[file_info.component] = location.uri 

1956 

1957 return uris 

1958 

1959 def retrieveArtifacts( 

1960 self, 

1961 refs: Iterable[DatasetRef], 

1962 destination: ResourcePath, 

1963 transfer: str = "auto", 

1964 preserve_path: bool = True, 

1965 overwrite: bool = False, 

1966 ) -> list[ResourcePath]: 

1967 """Retrieve the file artifacts associated with the supplied refs. 

1968 

1969 Parameters 

1970 ---------- 

1971 refs : iterable of `DatasetRef` 

1972 The datasets for which file artifacts are to be retrieved. 

1973 A single ref can result in multiple files. The refs must 

1974 be resolved. 

1975 destination : `lsst.resources.ResourcePath` 

1976 Location to write the file artifacts. 

1977 transfer : `str`, optional 

1978 Method to use to transfer the artifacts. Must be one of the options 

1979 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1980 "move" is not allowed. 

1981 preserve_path : `bool`, optional 

1982 If `True` the full path of the file artifact within the datastore 

1983 is preserved. If `False` the final file component of the path 

1984 is used. 

1985 overwrite : `bool`, optional 

1986 If `True` allow transfers to overwrite existing files at the 

1987 destination. 

1988 

1989 Returns 

1990 ------- 

1991 targets : `list` of `lsst.resources.ResourcePath` 

1992 URIs of file artifacts in destination location. Order is not 

1993 preserved. 

1994 """ 

1995 if not destination.isdir(): 

1996 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1997 

1998 if transfer == "move": 

1999 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

2000 

2001 # Source -> Destination 

2002 # This also helps filter out duplicate DatasetRef in the request 

2003 # that will map to the same underlying file transfer. 

2004 to_transfer: dict[ResourcePath, ResourcePath] = {} 

2005 

2006 for ref in refs: 

2007 locations = self._get_dataset_locations_info(ref) 

2008 for location, _ in locations: 

2009 source_uri = location.uri 

2010 target_path: ResourcePathExpression 

2011 if preserve_path: 

2012 target_path = location.pathInStore 

2013 if target_path.isabs(): 

2014 # This is an absolute path to an external file. 

2015 # Use the full path. 

2016 target_path = target_path.relativeToPathRoot 

2017 else: 

2018 target_path = source_uri.basename() 

2019 target_uri = destination.join(target_path) 

2020 to_transfer[source_uri] = target_uri 

2021 

2022 # In theory can now parallelize the transfer 

2023 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

2024 for source_uri, target_uri in to_transfer.items(): 

2025 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

2026 

2027 return list(to_transfer.values()) 

2028 
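
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper that copies every file artifact for some refs into
# a scratch directory using retrieveArtifacts(); "copy" is chosen because
# "move" is rejected by the method itself. The helper name and arguments
# are assumptions of this sketch.
def _example_retrieve(
    datastore: FileDatastore, refs: Iterable[DatasetRef], destination: ResourcePathExpression
) -> list[ResourcePath]:
    """Copy all file artifacts for ``refs`` into ``destination`` (sketch)."""
    dest = ResourcePath(destination, forceDirectory=True)
    # preserve_path=True keeps the datastore-relative layout under dest.
    return datastore.retrieveArtifacts(refs, dest, transfer="copy", preserve_path=True)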

2029 def get( 

2030 self, 

2031 ref: DatasetRef, 

2032 parameters: Mapping[str, Any] | None = None, 

2033 storageClass: StorageClass | str | None = None, 

2034 ) -> Any: 

2035 """Load an InMemoryDataset from the store. 

2036 

2037 Parameters 

2038 ---------- 

2039 ref : `DatasetRef` 

2040 Reference to the required Dataset. 

2041 parameters : `dict` 

2042 `StorageClass`-specific parameters that specify, for example, 

2043 a slice of the dataset to be loaded. 

2044 storageClass : `StorageClass` or `str`, optional 

2045 The storage class to be used to override the Python type 

2046 returned by this method. By default the returned type matches 

2047 the dataset type definition for this dataset. Specifying a 

2048 read `StorageClass` can force a different type to be returned. 

2049 This type must be compatible with the original type. 

2050 

2051 Returns 

2052 ------- 

2053 inMemoryDataset : `object` 

2054 Requested dataset or slice thereof as an InMemoryDataset. 

2055 

2056 Raises 

2057 ------ 

2058 FileNotFoundError 

2059 Requested dataset can not be retrieved. 

2060 TypeError 

2061 Return value from formatter has unexpected type. 

2062 ValueError 

2063 Formatter failed to process the dataset. 

2064 """ 

2065 # Supplied storage class for the component being read is either 

2066 # from the ref itself or an override if we want to force 

2067 # type conversion. 

2068 if storageClass is not None: 

2069 ref = ref.overrideStorageClass(storageClass) 

2070 refStorageClass = ref.datasetType.storageClass 

2071 

2072 allGetInfo = self._prepare_for_get(ref, parameters) 

2073 refComponent = ref.datasetType.component() 

2074 

2075 # Create mapping from component name to related info 

2076 allComponents = {i.component: i for i in allGetInfo} 

2077 

2078 # By definition the dataset is disassembled if we have more 

2079 # than one record for it. 

2080 isDisassembled = len(allGetInfo) > 1 

2081 

2082 # Look for the special case where we are disassembled but the 

2083 # component is a derived component that was not written during 

2084 # disassembly. For this scenario we need to check that the 

2085 # component requested is listed as a derived component for the 

2086 # composite storage class 

2087 isDisassembledReadOnlyComponent = False 

2088 if isDisassembled and refComponent: 

2089 # The composite storage class should be accessible through 

2090 # the component dataset type 

2091 compositeStorageClass = ref.datasetType.parentStorageClass 

2092 

2093 # In the unlikely scenario where the composite storage 

2094 # class is not known, we can only assume that this is a 

2095 # normal component. If that assumption is wrong then the 

2096 # branch below that reads a persisted component will fail 

2097 # so there is no need to complain here. 

2098 if compositeStorageClass is not None: 

2099 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

2100 

2101 if isDisassembled and not refComponent: 

2102 # This was a disassembled dataset spread over multiple files 

2103 # and we need to put them all back together again. 

2104 # Read into memory and then assemble 

2105 

2106 # Check that the supplied parameters are suitable for the type read 

2107 refStorageClass.validateParameters(parameters) 

2108 

2109 # We want to keep track of all the parameters that were not used 

2110 # by formatters. We assume that if any of the component formatters 

2111 # use a parameter then we do not need to apply it again in the 

2112 # assembler. 

2113 usedParams = set() 

2114 

2115 components: dict[str, Any] = {} 

2116 for getInfo in allGetInfo: 

2117 # assemblerParams are parameters not understood by the 

2118 # associated formatter. 

2119 usedParams.update(set(getInfo.formatterParams)) 

2120 

2121 component = getInfo.component 

2122 

2123 if component is None: 

2124 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

2125 

2126 # We do not want the formatter to think it's reading 

2127 # a component though because it is really reading a 

2128 # standalone dataset -- always tell reader it is not a 

2129 # component. 

2130 components[component] = self._read_artifact_into_memory( 

2131 getInfo, ref.makeComponentRef(component), isComponent=False 

2132 ) 

2133 

2134 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

2135 

2136 # Any unused parameters will have to be passed to the assembler 

2137 if parameters: 

2138 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

2139 else: 

2140 unusedParams = {} 

2141 

2142 # Process parameters 

2143 return ref.datasetType.storageClass.delegate().handleParameters( 

2144 inMemoryDataset, parameters=unusedParams 

2145 ) 

2146 

2147 elif isDisassembledReadOnlyComponent: 

2148 compositeStorageClass = ref.datasetType.parentStorageClass 

2149 if compositeStorageClass is None: 

2150 raise RuntimeError( 

2151 f"Unable to retrieve derived component '{refComponent}' since " 

2152 "no composite storage class is available." 

2153 ) 

2154 

2155 if refComponent is None: 

2156 # Mainly for mypy 

2157 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

2158 

2159 # Assume that every derived component can be calculated by 

2160 # forwarding the request to a single read/write component. 

2161 # Rather than guessing which rw component is the right one by 

2162 # scanning each for a derived component of the same name, 

2163 # we ask the storage class delegate directly which one is best to 

2164 # use. 

2165 compositeDelegate = compositeStorageClass.delegate() 

2166 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

2167 refComponent, set(allComponents) 

2168 ) 

2169 

2170 # Select the relevant component 

2171 rwInfo = allComponents[forwardedComponent] 

2172 

2173 # For now assume that read parameters are validated against 

2174 # the real component and not the requested component 

2175 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

2176 forwardedStorageClass.validateParameters(parameters) 

2177 

2178 # The reference to use for the caching must refer to the forwarded 

2179 # component and not the derived component. 

2180 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

2181 

2182 # Unfortunately the FileDescriptor inside the formatter will have 

2183 # the wrong write storage class so we need to create a new one 

2184 # given the immutability constraint. 

2185 writeStorageClass = rwInfo.info.storageClass 

2186 

2187 # We may need to put some thought into parameters for read 

2188 # components but for now forward them on as is 

2189 readFormatter = type(rwInfo.formatter)( 

2190 FileDescriptor( 

2191 rwInfo.location, 

2192 readStorageClass=refStorageClass, 

2193 storageClass=writeStorageClass, 

2194 parameters=parameters, 

2195 ), 

2196 ref.dataId, 

2197 ) 

2198 

2199 # The assembler can not receive any parameter requests for a 

2200 # derived component at this time since the assembler will 

2201 # see the storage class of the derived component and those 

2202 # parameters will have to be handled by the formatter on the 

2203 # forwarded storage class. 

2204 assemblerParams: dict[str, Any] = {} 

2205 

2206 # Need to create a new info that specifies the derived 

2207 # component and associated storage class 

2208 readInfo = DatastoreFileGetInformation( 

2209 rwInfo.location, 

2210 readFormatter, 

2211 rwInfo.info, 

2212 assemblerParams, 

2213 {}, 

2214 refComponent, 

2215 refStorageClass, 

2216 ) 

2217 

2218 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2219 

2220 else: 

2221 # Single file request or component from that composite file 

2222 for lookup in (refComponent, None): 

2223 if lookup in allComponents: 

2224 getInfo = allComponents[lookup] 

2225 break 

2226 else: 

2227 raise FileNotFoundError( 

2228 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2229 ) 

2230 

2231 # Do not need the component itself if already disassembled 

2232 if isDisassembled: 

2233 isComponent = False 

2234 else: 

2235 isComponent = getInfo.component is not None 

2236 

2237 # For a component read of a composite we want the cache to 

2238 # be looking at the composite ref itself. 

2239 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2240 

2241 # For a disassembled component we can validate parameters against 

2242 # the component storage class directly 

2243 if isDisassembled: 

2244 refStorageClass.validateParameters(parameters) 

2245 else: 

2246 # For an assembled composite this could be a derived 

2247 # component derived from a real component. The validity 

2248 # of the parameters is not clear. For now validate against 

2249 # the composite storage class 

2250 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2251 

2252 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2253 
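
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper showing get() with a read storage class override,
# which converts the returned Python type; the override must be compatible
# with the storage class the dataset was written with. The helper name and
# arguments are assumptions of this sketch.
def _example_get_converted(
    datastore: FileDatastore, ref: DatasetRef, read_storage_class: StorageClass | str
) -> Any:
    """Read ``ref`` converted to ``read_storage_class`` (sketch)."""
    return datastore.get(ref, storageClass=read_storage_class)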

2254 @transactional 

2255 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2256 """Write an InMemoryDataset with a given `DatasetRef` to the store. 

2257 

2258 Parameters 

2259 ---------- 

2260 inMemoryDataset : `object` 

2261 The dataset to store. 

2262 ref : `DatasetRef` 

2263 Reference to the associated Dataset. 

2264 

2265 Raises 

2266 ------ 

2267 TypeError 

2268 Supplied object and storage class are inconsistent. 

2269 DatasetTypeNotSupportedError 

2270 The associated `DatasetType` is not handled by this datastore. 

2271 

2272 Notes 

2273 ----- 

2274 If the datastore is configured to reject certain dataset types it 

2275 is possible that the put will fail and raise a 

2276 `DatasetTypeNotSupportedError`. The main use case for this is to 

2277 allow `ChainedDatastore` to put to multiple datastores without 

2278 requiring that every datastore accepts the dataset. 

2279 """ 

2280 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2281 # doDisassembly = True 

2282 

2283 artifacts = [] 

2284 if doDisassembly: 

2285 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2286 if components is None: 

2287 raise RuntimeError( 

2288 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2289 f"with storage class {ref.datasetType.storageClass.name} " 

2290 "is configured to be disassembled, but cannot be." 

2291 ) 

2292 for component, componentInfo in components.items(): 

2293 # Don't recurse because we want to take advantage of 

2294 # bulk insert -- need a new DatasetRef that refers to the 

2295 # same dataset_id but has the component DatasetType. 

2296 # DatasetType does not refer to the types of components, 

2297 # so we construct one ourselves. 

2298 compRef = ref.makeComponentRef(component) 

2299 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2300 artifacts.append((compRef, storedInfo)) 

2301 else: 

2302 # Write the entire thing out 

2303 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2304 artifacts.append((ref, storedInfo)) 

2305 

2306 self._register_datasets(artifacts) 

2307 
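
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical put/get round trip. Whether the dataset is stored as a
# single file or disassembled into per-component artifacts is decided by
# the configured composites map, not by the caller. The helper name and
# arguments are assumptions of this sketch.
def _example_round_trip(datastore: FileDatastore, in_memory_dataset: Any, ref: DatasetRef) -> Any:
    """Store ``in_memory_dataset`` under ``ref`` and read it back (sketch)."""
    datastore.put(in_memory_dataset, ref)
    return datastore.get(ref)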

2308 @transactional 

2309 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

2310 # At this point can safely remove these datasets from the cache 

2311 # to avoid confusion later on. If they are not trashed later 

2312 # the cache will simply be refilled. 

2313 self.cacheManager.remove_from_cache(ref) 

2314 

2315 # If we are in trust mode there will be nothing to move to 

2316 # the trash table and we will have to try to delete the file 

2317 # immediately. 

2318 if self.trustGetRequest: 

2319 # Try to keep the logic below for a single file trash. 

2320 if isinstance(ref, DatasetRef): 

2321 refs = {ref} 

2322 else: 

2323 # Will recreate ref at the end of this branch. 

2324 refs = set(ref) 

2325 

2326 # Determine which datasets are known to datastore directly. 

2327 id_to_ref = {ref.id: ref for ref in refs} 

2328 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2329 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2330 

2331 missing = refs - existing_refs 

2332 if missing: 

2333 # Do an explicit existence check on these refs. 

2334 # We only care about the artifacts at this point and not 

2335 # the dataset existence. 

2336 artifact_existence: dict[ResourcePath, bool] = {} 

2337 _ = self.mexists(missing, artifact_existence) 

2338 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2339 

2340 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2341 log.debug("Removing %d artifacts from datastore that are unknown to its records", len(uris)) 

2342 for uri in uris: 

2343 try: 

2344 uri.remove() 

2345 except Exception as e: 

2346 if ignore_errors: 

2347 log.debug("Artifact %s could not be removed: %s", uri, e) 

2348 continue 

2349 raise 

2350 

2351 # There is no point asking the code below to remove refs we 

2352 # know are missing so update it with the list of existing 

2353 # records. Try to retain one vs many logic. 

2354 if not existing_refs: 

2355 # Nothing more to do since none of the datasets were 

2356 # known to the datastore record table. 

2357 return 

2358 ref = list(existing_refs) 

2359 if len(ref) == 1: 

2360 ref = ref[0] 

2361 

2362 # Get file metadata and internal metadata 

2363 if not isinstance(ref, DatasetRef): 

2364 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2365 # Assumed to be an iterable of refs so bulk mode enabled. 

2366 try: 

2367 self.bridge.moveToTrash(ref, transaction=self._transaction) 

2368 except Exception as e: 

2369 if ignore_errors: 

2370 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2371 else: 

2372 raise 

2373 return 

2374 

2375 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2376 

2377 fileLocations = self._get_dataset_locations_info(ref) 

2378 

2379 if not fileLocations: 

2380 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2381 if ignore_errors: 

2382 log.warning(err_msg) 

2383 return 

2384 else: 

2385 raise FileNotFoundError(err_msg) 

2386 

2387 for location, storedFileInfo in fileLocations: 

2388 if not self._artifact_exists(location): 

2389 err_msg = ( 

2390 f"Dataset is known to datastore {self.name} but " 

2391 f"associated artifact ({location.uri}) is missing" 

2392 ) 

2393 if ignore_errors: 

2394 log.warning(err_msg) 

2395 return 

2396 else: 

2397 raise FileNotFoundError(err_msg) 

2398 

2399 # Mark dataset as trashed 

2400 try: 

2401 self.bridge.moveToTrash([ref], transaction=self._transaction) 

2402 except Exception as e: 

2403 if ignore_errors: 

2404 log.warning( 

2405 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2406 "but encountered an error: %s", 

2407 ref, 

2408 self.name, 

2409 e, 

2410 ) 

2411 pass 

2412 else: 

2413 raise 

2414 

2415 @transactional 

2416 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2417 """Remove all datasets from the trash. 

2418 

2419 Parameters 

2420 ---------- 

2421 ignore_errors : `bool` 

2422 If `True` return without error even if something went wrong. 

2423 Problems could occur if another process is simultaneously trying 

2424 to delete. 

2425 """ 

2426 log.debug("Emptying trash in datastore %s", self.name) 

2427 

2428 # Context manager will empty trash iff we finish it without raising. 

2429 # It will also automatically delete the relevant rows from the 

2430 # trash table and the records table. 

2431 with self.bridge.emptyTrash( 

2432 self._table, record_class=StoredFileInfo, record_column="path" 

2433 ) as trash_data: 

2434 # Removing the artifacts themselves requires that the files are 

2435 # not also associated with refs that are not to be trashed. 

2436 # Therefore need to do a query with the file paths themselves 

2437 # and return all the refs associated with them. Can only delete 

2438 # a file if the refs to be trashed are the only refs associated 

2439 # with the file. 

2440 # This requires multiple copies of the trashed items 

2441 trashed, artifacts_to_keep = trash_data 

2442 

2443 if artifacts_to_keep is None: 

2444 # The bridge is not helping us so have to work it out 

2445 # ourselves. This is not going to be as efficient. 

2446 trashed = list(trashed) 

2447 

2448 # The instance check is for mypy since up to this point it 

2449 # does not know the type of info. 

2450 path_map = self._refs_associated_with_artifacts( 

2451 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2452 ) 

2453 

2454 for ref, info in trashed: 

2455 # Mypy needs to know this is not the base class 

2456 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2457 

2458 path_map[info.path].remove(ref.id) 

2459 if not path_map[info.path]: 

2460 del path_map[info.path] 

2461 

2462 artifacts_to_keep = set(path_map) 

2463 

2464 for ref, info in trashed: 

2465 # Should not happen for this implementation but need 

2466 # to keep mypy happy. 

2467 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2468 

2469 # Mypy needs to know this is not the base class 

2470 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2471 

2472 if info.path in artifacts_to_keep: 

2473 # This is a multi-dataset artifact and we are not 

2474 # removing all associated refs. 

2475 continue 

2476 

2477 # Only trashed refs still known to datastore will be returned. 

2478 location = info.file_location(self.locationFactory) 

2479 

2480 # Point of no return for this artifact 

2481 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2482 try: 

2483 self._delete_artifact(location) 

2484 except FileNotFoundError: 

2485 # If the file itself has been deleted there is nothing 

2486 # we can do about it. It is possible that trash has 

2487 # been run in parallel in another process or someone 

2488 # decided to delete the file. It is unlikely to come 

2489 # back and so we should still continue with the removal 

2490 # of the entry from the trash table. It is also possible 

2491 # we removed it in a previous iteration if it was 

2492 # a multi-dataset artifact. The delete artifact method 

2493 # will log a debug message in this scenario. 

2494 # Distinguishing file missing before trash started and 

2495 # file already removed previously as part of this trash 

2496 # is not worth the distinction with regards to potential 

2497 # memory cost. 

2498 pass 

2499 except Exception as e: 

2500 if ignore_errors: 

2501 # Use a debug message here even though it's not 

2502 # a good situation. In some cases this can be 

2503 # caused by a race between user A and user B 

2504 # and neither of them has permissions for the 

2505 # other's files. Butler does not know about users 

2506 # and trash has no idea what collections these 

2507 # files were in (without guessing from a path). 

2508 log.debug( 

2509 "Encountered error removing artifact %s from datastore %s: %s", 

2510 location.uri, 

2511 self.name, 

2512 e, 

2513 ) 

2514 else: 

2515 raise 

2516 
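
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper showing the two-step deletion model: trash() only
# marks datasets (and drops them from the local cache), while emptyTrash()
# deletes artifacts whose every associated ref has been trashed. The helper
# name and arguments are assumptions of this sketch.
def _example_delete(datastore: FileDatastore, refs: Iterable[DatasetRef]) -> None:
    """Trash ``refs`` and immediately remove their artifacts (sketch)."""
    datastore.trash(refs, ignore_errors=False)
    datastore.emptyTrash(ignore_errors=False)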

2517 @transactional 

2518 def transfer_from( 

2519 self, 

2520 source_datastore: Datastore, 

2521 refs: Iterable[DatasetRef], 

2522 transfer: str = "auto", 

2523 artifact_existence: dict[ResourcePath, bool] | None = None, 

2524 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

2525 # Docstring inherited 

2526 if type(self) is not type(source_datastore): 

2527 raise TypeError( 

2528 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2529 f"source datastore ({type(source_datastore)})." 

2530 ) 

2531 

2532 # Be explicit for mypy 

2533 if not isinstance(source_datastore, FileDatastore): 

2534 raise TypeError( 

2535 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2536 f" {type(source_datastore)}" 

2537 ) 

2538 

2539 # Stop early if "direct" transfer mode is requested. That would 

2540 # require that the URI inside the source datastore should be stored 

2541 # directly in the target datastore, which seems unlikely to be useful 

2542 # since at any moment the source datastore could delete the file. 

2543 if transfer in ("direct", "split"): 

2544 raise ValueError( 

2545 f"Can not transfer from a source datastore using {transfer} mode since" 

2546 " those files are controlled by the other datastore." 

2547 ) 

2548 

2549 # Empty existence lookup if none given. 

2550 if artifact_existence is None: 

2551 artifact_existence = {} 

2552 

2553 # We will go through the list multiple times so must convert 

2554 # generators to lists. 

2555 refs = list(refs) 

2556 

2557 # In order to handle disassembled composites the code works 

2558 # at the records level since it can assume that internal APIs 

2559 # can be used. 

2560 # - If the record already exists in the destination this is assumed 

2561 # to be okay. 

2562 # - If there is no record but the source and destination URIs are 

2563 # identical no transfer is done but the record is added. 

2564 # - If the source record refers to an absolute URI currently assume 

2565 # that that URI should remain absolute and will be visible to the 

2566 # destination butler. May need to have a flag to indicate whether 

2567 # the dataset should be transferred. This will only happen if 

2568 # the detached Butler has had a local ingest. 

2569 

2570 # What we really want is all the records in the source datastore 

2571 # associated with these refs. Or derived ones if they don't exist 

2572 # in the source. 

2573 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2574 

2575 # The source dataset_ids are the keys in these records 

2576 source_ids = set(source_records) 

2577 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2578 

2579 requested_ids = {ref.id for ref in refs} 

2580 missing_ids = requested_ids - source_ids 

2581 

2582 # Missing IDs can be okay if that datastore has allowed 

2583 # gets based on file existence. Should we transfer what we can 

2584 # or complain about it and warn? 

2585 if missing_ids and not source_datastore.trustGetRequest: 

2586 raise ValueError( 

2587 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2588 ) 

2589 

2590 # Need to map these missing IDs to a DatasetRef so we can guess 

2591 # the details. 

2592 if missing_ids: 

2593 log.info( 

2594 "Number of expected datasets missing from source datastore records: %d out of %d", 

2595 len(missing_ids), 

2596 len(requested_ids), 

2597 ) 

2598 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2599 

2600 # This should be chunked in case we end up having to check 

2601 # the file store since we need some log output to show 

2602 # progress. 

2603 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2604 records = {} 

2605 for missing in missing_ids_chunk: 

2606 # Ask the source datastore where the missing artifacts 

2607 # should be. An execution butler might not know about the 

2608 # artifacts even if they are there. 

2609 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2610 records[missing] = [info for _, info in expected] 

2611 

2612 # Call the mexists helper method in case we have not already 

2613 # checked these artifacts such that artifact_existence is 

2614 # empty. This allows us to benefit from parallelism. 

2615 # datastore.mexists() itself does not give us access to the 

2616 # derived datastore record. 

2617 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2618 ref_exists = source_datastore._process_mexists_records( 

2619 id_to_ref, records, False, artifact_existence=artifact_existence 

2620 ) 

2621 

2622 # Now go through the records and propagate the ones that exist. 

2623 location_factory = source_datastore.locationFactory 

2624 for missing, record_list in records.items(): 

2625 # Skip completely if the ref does not exist. 

2626 ref = id_to_ref[missing] 

2627 if not ref_exists[ref]: 

2628 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2629 continue 

2630 # Check for file artifact to decide which parts of a 

2631 # disassembled composite do exist. If there is only a 

2632 # single record we don't even need to look because it can't 

2633 # be a composite and must exist. 

2634 if len(record_list) == 1: 

2635 dataset_records = record_list 

2636 else: 

2637 dataset_records = [ 

2638 record 

2639 for record in record_list 

2640 if artifact_existence[record.file_location(location_factory).uri] 

2641 ] 

2642 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2643 

2644 # Rely on source_records being a defaultdict. 

2645 source_records[missing].extend(dataset_records) 

2646 

2647 # See if we already have these records 

2648 target_records = self._get_stored_records_associated_with_refs(refs) 

2649 

2650 # The artifacts to register 

2651 artifacts = [] 

2652 

2653 # Refs that already exist 

2654 already_present = [] 

2655 

2656 # Refs that were rejected by this datastore. 

2657 rejected = set() 

2658 

2659 # Refs that were transferred successfully. 

2660 accepted = set() 

2661 

2662 # Record each time we have done a "direct" transfer. 

2663 direct_transfers = [] 

2664 

2665 # Now can transfer the artifacts 

2666 for ref in refs: 

2667 if not self.constraints.isAcceptable(ref): 

2668 # This datastore should not be accepting this dataset. 

2669 rejected.add(ref) 

2670 continue 

2671 

2672 accepted.add(ref) 

2673 

2674 if ref.id in target_records: 

2675 # Already have an artifact for this. 

2676 already_present.append(ref) 

2677 continue 

2678 

2679 # mypy needs to know these are always resolved refs 

2680 for info in source_records[ref.id]: 

2681 source_location = info.file_location(source_datastore.locationFactory) 

2682 target_location = info.file_location(self.locationFactory) 

2683 if source_location == target_location and not source_location.pathInStore.isabs(): 

2684 # Artifact is already in the target location. 

2685 # (which is how execution butler currently runs) 

2686 pass 

2687 else: 

2688 if target_location.pathInStore.isabs(): 

2689 # Just because we can see the artifact when running 

2690 # the transfer doesn't mean it will be generally 

2691 # accessible to a user of this butler. Need to decide 

2692 # what to do about an absolute path. 

2693 if transfer == "auto": 

2694 # For "auto" transfers we allow the absolute URI 

2695 # to be recorded in the target datastore. 

2696 direct_transfers.append(source_location) 

2697 else: 

2698 # The user is explicitly requesting a transfer 

2699 # even for an absolute URI. This requires us to 

2700 # calculate the target path. 

2701 template_ref = ref 

2702 if info.component: 

2703 template_ref = ref.makeComponentRef(info.component) 

2704 target_location = self._calculate_ingested_datastore_name( 

2705 source_location.uri, 

2706 template_ref, 

2707 ) 

2708 

2709 info = info.update(path=target_location.pathInStore.path) 

2710 

2711 # Need to transfer it to the new location. 

2712 # Assume we should always overwrite. If the artifact 

2713 # is there this might indicate that a previous transfer 

2714 # was interrupted but was not able to be rolled back 

2715 # completely (eg pre-emption) so follow Datastore default 

2716 # and overwrite. 

2717 target_location.uri.transfer_from( 

2718 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2719 ) 

2720 

2721 artifacts.append((ref, info)) 

2722 

2723 if direct_transfers: 

2724 log.info( 

2725 "Transfer request for an outside-datastore artifact with an absolute URI handled %d time%s", 

2726 len(direct_transfers), 

2727 "" if len(direct_transfers) == 1 else "s", 

2728 ) 

2729 

2730 self._register_datasets(artifacts) 

2731 

2732 if already_present: 

2733 n_skipped = len(already_present) 

2734 log.info( 

2735 "Skipped transfer of %d dataset%s already present in datastore", 

2736 n_skipped, 

2737 "" if n_skipped == 1 else "s", 

2738 ) 

2739 

2740 return accepted, rejected 

2741 
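
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper around transfer_from() that pre-populates the
# shared artifact-existence cache with mexists(), so each source URI is
# checked at most once even when the source datastore is in trust mode.
# The helper name, arguments and "copy" transfer mode are assumptions of
# this sketch.
def _example_transfer(
    target: FileDatastore, source: FileDatastore, refs: Iterable[DatasetRef]
) -> tuple[set[DatasetRef], set[DatasetRef]]:
    """Transfer ``refs`` from ``source`` into ``target`` (sketch)."""
    refs = list(refs)
    artifact_existence: dict[ResourcePath, bool] = {}
    source.mexists(refs, artifact_existence)
    return target.transfer_from(source, refs, transfer="copy", artifact_existence=artifact_existence)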

2742 @transactional 

2743 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2744 # Docstring inherited. 

2745 refs = list(refs) 

2746 self.bridge.forget(refs) 

2747 self._table.delete(["dataset_id"], *[{"dataset_id": ref.id} for ref in refs]) 

2748 

2749 def validateConfiguration( 

2750 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

2751 ) -> None: 

2752 """Validate some of the configuration for this datastore. 

2753 

2754 Parameters 

2755 ---------- 

2756 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2757 Entities to test against this configuration. Can be differing 

2758 types. 

2759 logFailures : `bool`, optional 

2760 If `True`, output a log message for every validation error 

2761 detected. 

2762 

2763 Raises 

2764 ------ 

2765 DatastoreValidationError 

2766 Raised if there is a validation problem with a configuration. 

2767 All the problems are reported in a single exception. 

2768 

2769 Notes 

2770 ----- 

2771 This method checks that all the supplied entities have valid file 

2772 templates and also have formatters defined. 

2773 """ 

2774 templateFailed = None 

2775 try: 

2776 self.templates.validateTemplates(entities, logFailures=logFailures) 

2777 except FileTemplateValidationError as e: 

2778 templateFailed = str(e) 

2779 

2780 formatterFailed = [] 

2781 for entity in entities: 

2782 try: 

2783 self.formatterFactory.getFormatterClass(entity) 

2784 except KeyError as e: 

2785 formatterFailed.append(str(e)) 

2786 if logFailures: 

2787 log.critical("Formatter failure: %s", e) 

2788 

2789 if templateFailed or formatterFailed: 

2790 messages = [] 

2791 if templateFailed: 

2792 messages.append(templateFailed) 

2793 if formatterFailed: 

2794 messages.append(",".join(formatterFailed)) 

2795 msg = ";\n".join(messages) 

2796 raise DatastoreValidationError(msg) 

2797 

2798 def getLookupKeys(self) -> set[LookupKey]: 

2799 # Docstring is inherited from base class 

2800 return ( 

2801 self.templates.getLookupKeys() 

2802 | self.formatterFactory.getLookupKeys() 

2803 | self.constraints.getLookupKeys() 

2804 ) 

2805 

2806 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

2807 # Docstring is inherited from base class 

2808 # The key can be valid in either formatters or templates so we can 

2809 # only check the template if it exists 

2810 if lookupKey in self.templates: 

2811 try: 

2812 self.templates[lookupKey].validateTemplate(entity) 

2813 except FileTemplateValidationError as e: 

2814 raise DatastoreValidationError(e) from e 

2815 

2816 def export( 

2817 self, 

2818 refs: Iterable[DatasetRef], 

2819 *, 

2820 directory: ResourcePathExpression | None = None, 

2821 transfer: str | None = "auto", 

2822 ) -> Iterable[FileDataset]: 

2823 # Docstring inherited from Datastore.export. 

2824 if transfer == "auto" and directory is None: 

2825 transfer = None 

2826 

2827 if transfer is not None and directory is None: 

2828 raise TypeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2829 

2830 if transfer == "move": 

2831 raise TypeError("Can not export by moving files out of datastore.") 

2832 elif transfer == "direct": 

2833 # For an export, treat this as equivalent to None. We do not 

2834 # want an import to risk using absolute URIs to datasets owned 

2835 # by another datastore. 

2836 log.info("Treating 'direct' transfer mode as in-place export.") 

2837 transfer = None 

2838 

2839 # Force the directory to be a URI object 

2840 directoryUri: ResourcePath | None = None 

2841 if directory is not None: 

2842 directoryUri = ResourcePath(directory, forceDirectory=True) 

2843 

2844 if transfer is not None and directoryUri is not None: 

2845 # mypy needs the second test 

2846 if not directoryUri.exists(): 

2847 raise FileNotFoundError(f"Export location {directory} does not exist") 

2848 

2849 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2850 for ref in progress.wrap(refs, "Exporting dataset files"): 

2851 fileLocations = self._get_dataset_locations_info(ref) 

2852 if not fileLocations: 

2853 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2854 # For now we can not export disassembled datasets 

2855 if len(fileLocations) > 1: 

2856 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2857 location, storedFileInfo = fileLocations[0] 

2858 

2859 pathInStore = location.pathInStore.path 

2860 if transfer is None: 

2861 # TODO: do we also need to return the readStorageClass somehow? 

2862 # We will use the path in store directly. If this is an 

2863 # absolute URI, preserve it. 

2864 if location.pathInStore.isabs(): 

2865 pathInStore = str(location.uri) 

2866 elif transfer == "direct": 

2867 # Use full URIs to the remote store in the export 

2868 pathInStore = str(location.uri) 

2869 else: 

2870 # mypy needs help 

2871 assert directoryUri is not None, "directoryUri must be defined to get here" 

2872 storeUri = ResourcePath(location.uri) 

2873 

2874 # if the datastore has an absolute URI to a resource, we 

2875 # have two options: 

2876 # 1. Keep the absolute URI in the exported YAML 

2877 # 2. Allocate a new name in the local datastore and transfer 

2878 # it. 

2879 # For now go with option 2 

2880 if location.pathInStore.isabs(): 

2881 template = self.templates.getTemplate(ref) 

2882 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2883 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2884 

2885 exportUri = directoryUri.join(pathInStore) 

2886 exportUri.transfer_from(storeUri, transfer=transfer) 

2887 

2888 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2889 
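
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper that materializes the export() generator while
# copying files into an existing export directory; recording the returned
# FileDataset entries in an export index is left to the caller. The helper
# name and arguments are assumptions of this sketch.
def _example_export(
    datastore: FileDatastore, refs: Iterable[DatasetRef], directory: ResourcePathExpression
) -> list[FileDataset]:
    """Export ``refs`` by copying their files into ``directory`` (sketch)."""
    return list(datastore.export(refs, directory=directory, transfer="copy"))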

2890 @staticmethod 

2891 def computeChecksum(uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192) -> str | None: 

2892 """Compute the checksum of the supplied file. 

2893 

2894 Parameters 

2895 ---------- 

2896 uri : `lsst.resources.ResourcePath` 

2897 Name of resource to calculate checksum from. 

2898 algorithm : `str`, optional 

2899 Name of algorithm to use. Must be one of the algorithms supported 

2900 by the :py:mod:`hashlib` module. 

2901 block_size : `int` 

2902 Number of bytes to read from file at one time. 

2903 

2904 Returns 

2905 ------- 

2906 hexdigest : `str` 

2907 Hex digest of the file. 

2908 

2909 Notes 

2910 ----- 

2911 Currently returns None if the URI is for a remote resource. 

2912 """ 

2913 if algorithm not in hashlib.algorithms_guaranteed: 

2914 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2915 

2916 if not uri.isLocal: 

2917 return None 

2918 

2919 hasher = hashlib.new(algorithm) 

2920 

2921 with uri.as_local() as local_uri: 

2922 with open(local_uri.ospath, "rb") as f: 

2923 for chunk in iter(lambda: f.read(block_size), b""): 

2924 hasher.update(chunk) 

2925 

2926 return hasher.hexdigest() 

2927 
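
# --- Illustrative usage sketch (not part of the original module) ---
# computeChecksum() is a static method, so it can be called without a
# datastore instance; remote URIs currently yield `None`. The helper name,
# argument and the choice of "sha256" are assumptions of this sketch.
def _example_checksums(uris: Iterable[ResourcePath]) -> dict[ResourcePath, str | None]:
    """Compute sha256 digests for local ``uris`` (sketch)."""
    return {uri: FileDatastore.computeChecksum(uri, algorithm="sha256") for uri in uris}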

2928 def needs_expanded_data_ids( 

2929 self, 

2930 transfer: str | None, 

2931 entity: DatasetRef | DatasetType | StorageClass | None = None, 

2932 ) -> bool: 

2933 # Docstring inherited. 

2934 # This _could_ also use entity to inspect whether the filename template 

2935 # involves placeholders other than the required dimensions for its 

2936 # dataset type, but that's not necessary for correctness; it just 

2937 # enables more optimizations (perhaps only in theory). 

2938 return transfer not in ("direct", None) 

2939 

2940 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2941 # Docstring inherited from the base class. 

2942 record_data = data.get(self.name) 

2943 if not record_data: 

2944 return 

2945 

2946 self._bridge.insert(FakeDatasetRef(dataset_id) for dataset_id in record_data.records.keys()) 

2947 

2948 # TODO: Verify that there are no unexpected table names in the dict? 

2949 unpacked_records = [] 

2950 for dataset_data in record_data.records.values(): 

2951 records = dataset_data.get(self._table.name) 

2952 if records: 

2953 for info in records: 

2954 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2955 unpacked_records.append(info.to_record()) 

2956 if unpacked_records: 

2957 self._table.insert(*unpacked_records, transaction=self._transaction) 

2958 

2959 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2960 # Docstring inherited from the base class. 

2961 exported_refs = list(self._bridge.check(refs)) 

2962 ids = {ref.id for ref in exported_refs} 

2963 records: dict[DatasetId, dict[str, list[StoredDatastoreItemInfo]]] = {id: {} for id in ids} 

2964 for row in self._table.fetch(dataset_id=ids): 

2965 info: StoredDatastoreItemInfo = StoredFileInfo.from_record(row) 

2966 dataset_records = records.setdefault(info.dataset_id, {}) 

2967 dataset_records.setdefault(self._table.name, []).append(info) 

2968 

2969 record_data = DatastoreRecordData(records=records) 

2970 return {self.name: record_data} 

2971 
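
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical helper showing how the opaque datastore records for a set
# of refs can be exported from one FileDatastore and imported into another
# without touching the file artifacts. Re-keying the payload under the
# destination's name is an assumption of this sketch, as are the helper
# name and arguments.
def _example_copy_records(
    source: FileDatastore, target: FileDatastore, refs: Iterable[DatasetIdRef]
) -> None:
    """Copy datastore records for ``refs`` from ``source`` into ``target`` (sketch)."""
    exported = source.export_records(refs)
    # import_records() looks records up under the receiving datastore's
    # name, so the exported mapping is re-keyed here.
    target.import_records({target.name: record_data for record_data in exported.values()})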

2972 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

2973 # Docstring inherited from the base class. 

2974 self._retrieve_dataset_method = method 

2975 

2976 def _cast_storage_class(self, ref: DatasetRef) -> DatasetRef: 

2977 """Update dataset reference to use the storage class from registry.""" 

2978 if self._retrieve_dataset_method is None: 

2979 # We could raise an exception here but unit tests do not define 

2980 # this method. 

2981 return ref 

2982 dataset_type = self._retrieve_dataset_method(ref.datasetType.name) 

2983 if dataset_type is not None: 

2984 ref = ref.overrideStorageClass(dataset_type.storageClass) 

2985 return ref