Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%

840 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 CompositesMap, 

48 Config, 

49 DatasetId, 

50 DatasetRef, 

51 DatasetType, 

52 DatasetTypeNotSupportedError, 

53 Datastore, 

54 DatastoreCacheManager, 

55 DatastoreConfig, 

56 DatastoreDisabledCacheManager, 

57 DatastoreValidationError, 

58 FileDataset, 

59 FileDescriptor, 

60 FileTemplates, 

61 FileTemplateValidationError, 

62 Formatter, 

63 FormatterFactory, 

64 Location, 

65 LocationFactory, 

66 Progress, 

67 StorageClass, 

68 StoredFileInfo, 

69 ddl, 

70) 

71from lsst.daf.butler.core.repoRelocation import replaceRoot 

72from lsst.daf.butler.core.utils import transactional 

73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

74from lsst.resources import ResourcePath, ResourcePathExpression 

75from lsst.utils.introspection import get_class_of, get_instance_of 

76from lsst.utils.iteration import chunk_iterable 

77 

78# For VERBOSE logging usage. 

79from lsst.utils.logging import VERBOSE, getLogger 

80from lsst.utils.timer import time_this 

81from sqlalchemy import BigInteger, String 

82 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING:  [85 ↛ 86: line 85 didn't jump to line 86, because the condition on line 85 was never true]

86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Dict[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Dict[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ResourcePath 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters( 

212 DatastoreConfig, 

213 config, 

214 full, 

215 toUpdate={"root": root}, 

216 toCopy=("cls", ("records", "table")), 

217 overwrite=overwrite, 

218 ) 

219 

220 @classmethod 

221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

222 return ddl.TableSpec( 

223 fields=[ 

224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

228 # Use empty string to indicate no component 

229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

230 # TODO: should checksum be Base64Bytes instead? 

231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

233 ], 

234 unique=frozenset(), 

235 indexes=[tuple(["path"])], 

236 ) 

237 
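
For orientation, here is a minimal plain-Python sketch of what one row of this records table holds. The field names mirror the spec above; the dataclass, example values and formatter name are purely illustrative and not part of daf_butler.

from dataclasses import dataclass
from typing import Optional

@dataclass
class FileRecordSketch:
    """Illustrative stand-in for one row of the datastore records table."""
    dataset_id: str            # primary key; integer or UUID depending on the registry
    path: str                  # path within the datastore, or an absolute URI
    formatter: str             # fully qualified Formatter class name
    storage_class: str
    component: str             # empty string means "no component"; part of the primary key
    checksum: Optional[str] = None
    file_size: Optional[int] = None

# A disassembled composite yields several rows sharing one dataset_id and
# differing only in component and path, which is why (dataset_id, component)
# forms the composite primary key above.
row = FileRecordSketch(
    dataset_id="00000000-0000-0000-0000-000000000001",
    path="run/calexp_example.fits",
    formatter="mypkg.formatters.ExampleFitsFormatter",  # hypothetical name
    storage_class="ExposureF",
    component="",
    file_size=1234,
)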

238 def __init__( 

239 self, 

240 config: Union[DatastoreConfig, str], 

241 bridgeManager: DatastoreRegistryBridgeManager, 

242 butlerRoot: Optional[str] = None, 

243 ): 

244 super().__init__(config, bridgeManager) 

245 if "root" not in self.config: 245 ↛ 246line 245 didn't jump to line 246, because the condition on line 245 was never true

246 raise ValueError("No root directory specified in configuration") 

247 

248 # Name ourselves either using an explicit name or a name 

249 # derived from the (unexpanded) root 

250 if "name" in self.config: 

251 self.name = self.config["name"] 

252 else: 

253 # We use the unexpanded root in the name to indicate that this 

254 # datastore can be moved without having to update registry. 

255 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

256 

257 # Support repository relocation in config 

258 # Existence of self.root is checked in subclass 

259 self.root = ResourcePath( 

260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

261 ) 

262 

263 self.locationFactory = LocationFactory(self.root) 

264 self.formatterFactory = FormatterFactory() 

265 

266 # Now associate formatters with storage classes 

267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

268 

269 # Read the file naming templates 

270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

271 

272 # See if composites should be disassembled 

273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

280 ) 

281 # Interface to Registry. 

282 self._bridge = bridgeManager.register(self.name) 

283 except ReadOnlyDatabaseError: 

284 # If the database is read only and we just tried and failed to 

285 # create a table, it means someone is trying to create a read-only 

286 # butler client for an empty repo. That should be okay, as long 

287 # as they then try to get any datasets before some other client 

288 # creates the table. Chances are they're just validating 

289 # configuration. 

290 pass 

291 

292 # Determine whether checksums should be used - default to False 

293 self.useChecksum = self.config.get("checksum", False) 

294 

295 # Determine whether we can fall back to configuration if a 

296 # requested dataset is not known to registry 

297 self.trustGetRequest = self.config.get("trust_get_request", False) 

298 

299 # Create a cache manager 

300 self.cacheManager: AbstractDatastoreCacheManager 

301 if "cached" in self.config: 301 ↛ 304line 301 didn't jump to line 304, because the condition on line 301 was never false

302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

305 

306 # Check existence and create directory structure if necessary 

307 if not self.root.exists(): 

308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309line 308 didn't jump to line 309, because the condition on line 308 was never true

309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

310 try: 

311 self.root.mkdir() 

312 except Exception as e: 

313 raise ValueError( 

314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

315 ) from e 

316 

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 def _artifact_exists(self, location: Location) -> bool: 

325 """Check that an artifact exists in this datastore at the specified 

326 location. 

327 

328 Parameters 

329 ---------- 

330 location : `Location` 

331 Expected location of the artifact associated with this datastore. 

332 

333 Returns 

334 ------- 

335 exists : `bool` 

336 `True` if the location can be found, `False` otherwise. 

337 """ 

338 log.debug("Checking if resource exists: %s", location.uri) 

339 return location.uri.exists() 

340 

341 def _delete_artifact(self, location: Location) -> None: 

342 """Delete the artifact from the datastore. 

343 

344 Parameters 

345 ---------- 

346 location : `Location` 

347 Location of the artifact associated with this datastore. 

348 """ 

349 if location.pathInStore.isabs():  [349 ↛ 350: line 349 didn't jump to line 350, because the condition on line 349 was never true]

350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

351 

352 try: 

353 location.uri.remove() 

354 except FileNotFoundError: 

355 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

356 raise 

357 except Exception as e: 

358 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

359 raise 

360 log.debug("Successfully deleted file: %s", location.uri) 

361 

362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

363 # Docstring inherited from GenericBaseDatastore 

364 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

365 self._table.insert(*records) 

366 

367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

368 # Docstring inherited from GenericBaseDatastore 

369 

370 # Look for the dataset_id -- there might be multiple matches 

371 # if we have disassembled the dataset. 

372 records = self._table.fetch(dataset_id=ref.id) 

373 return [StoredFileInfo.from_record(record) for record in records] 

374 

375 def _get_stored_records_associated_with_refs( 

376 self, refs: Iterable[DatasetIdRef] 

377 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

378 """Retrieve all records associated with the provided refs. 

379 

380 Parameters 

381 ---------- 

382 refs : iterable of `DatasetIdRef` 

383 The refs for which records are to be retrieved. 

384 

385 Returns 

386 ------- 

387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

388 The matching records indexed by the ref ID. The number of entries 

389 in the dict can be smaller than the number of requested refs. 

390 """ 

391 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

392 

393 # Uniqueness is dataset_id + component so can have multiple records 

394 # per ref. 

395 records_by_ref = defaultdict(list) 

396 for record in records: 

397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

398 return records_by_ref 

399 

400 def _refs_associated_with_artifacts( 

401 self, paths: List[Union[str, ResourcePath]] 

402 ) -> Dict[str, Set[DatasetId]]: 

403 """Return paths and associated dataset refs. 

404 

405 Parameters 

406 ---------- 

407 paths : `list` of `str` or `lsst.resources.ResourcePath` 

408 All the paths to include in search. 

409 

410 Returns 

411 ------- 

412 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

413 Mapping of each path to a set of associated database IDs. 

414 """ 

415 records = self._table.fetch(path=[str(path) for path in paths]) 

416 result = defaultdict(set) 

417 for row in records: 

418 result[row["path"]].add(row["dataset_id"]) 

419 return result 

420 

421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

422 """Return all dataset refs associated with the supplied path. 

423 

424 Parameters 

425 ---------- 

426 pathInStore : `lsst.resources.ResourcePath` 

427 Path of interest in the data store. 

428 

429 Returns 

430 ------- 

431 ids : `set` of `DatasetId` 

432 All `DatasetRef` IDs associated with this path. 

433 """ 

434 records = list(self._table.fetch(path=str(pathInStore))) 

435 ids = {r["dataset_id"] for r in records} 

436 return ids 

437 

438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

439 # Docstring inherited from GenericBaseDatastore 

440 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

441 

442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

443 r"""Find all the `Location`\ s of the requested dataset in the 

444 `Datastore` and the associated stored file information. 

445 

446 Parameters 

447 ---------- 

448 ref : `DatasetRef` 

449 Reference to the required `Dataset`. 

450 

451 Returns 

452 ------- 

453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

454 Location of the dataset within the datastore and 

455 stored information about each file and its formatter. 

456 """ 

457 # Get the file information (this will fail if no file) 

458 records = self.getStoredItemsInfo(ref) 

459 

460 # Use the path to determine the location -- we need to take 

461 # into account absolute URIs in the datastore record 

462 return [(r.file_location(self.locationFactory), r) for r in records] 

463 

464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

465 """Check that there is only one dataset associated with the 

466 specified artifact. 

467 

468 Parameters 

469 ---------- 

470 ref : `DatasetRef` or `FakeDatasetRef` 

471 Dataset to be removed. 

472 location : `Location` 

473 The location of the artifact to be removed. 

474 

475 Returns 

476 ------- 

477 can_remove : `bool` 

478 `True` if the artifact can be safely removed. 

479 """ 

480 # Can't ever delete absolute URIs. 

481 if location.pathInStore.isabs(): 

482 return False 

483 

484 # Get all entries associated with this path 

485 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

486 if not allRefs: 

487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

488 

489 # Remove these refs from all the refs and if there is nothing left 

490 # then we can delete 

491 remainingRefs = allRefs - {ref.id} 

492 

493 if remainingRefs: 

494 return False 

495 return True 

496 
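
The guard above is a small reference count over the records table: an artifact may only be removed when no other registered dataset still points at the same path, and absolute URIs are never deleted. A standalone sketch of that logic (the helper name and types are stand-ins, not the datastore API):

from typing import Set

def can_remove_artifact(ref_id: str, refs_sharing_path: Set[str], path_is_absolute: bool) -> bool:
    """Return True only if deleting ref_id leaves no other users of the artifact."""
    if path_is_absolute:
        # Artifacts referenced by absolute URIs were never copied in, so never delete them.
        return False
    if not refs_sharing_path:
        raise RuntimeError("Datastore inconsistency: artifact has no registry entries")
    remaining = refs_sharing_path - {ref_id}
    return not remaining

# Two refs were associated with one file, so removing one must keep the artifact:
assert can_remove_artifact("a", {"a", "b"}, path_is_absolute=False) is False
assert can_remove_artifact("a", {"a"}, path_is_absolute=False) is True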

497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

498 """Predict the location and related file information of the requested 

499 dataset in this datastore. 

500 

501 Parameters 

502 ---------- 

503 ref : `DatasetRef` 

504 Reference to the required `Dataset`. 

505 

506 Returns 

507 ------- 

508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

509 Expected Location of the dataset within the datastore and 

510 placeholder information about each file and its formatter. 

511 

512 Notes 

513 ----- 

514 Uses the current configuration to determine how we would expect the 

515 datastore files to have been written if we couldn't ask registry. 

516 This is safe so long as there has been no change to datastore 

517 configuration between writing the dataset and wanting to read it. 

518 Will not work for files that have been ingested without using the 

519 standard file template or default formatter. 

520 """ 

521 

522 # If we have a component ref we always need to ask the questions 

523 # of the composite. If the composite is disassembled this routine 

524 # should return all components. If the composite was not 

525 # disassembled the composite is what is stored regardless of 

526 # component request. Note that if the caller has disassembled 

527 # a composite there is no way for this guess to know that 

528 # without trying both the composite and component ref and seeing 

529 # if there is something at the component Location even without 

530 # disassembly being enabled. 

531 if ref.datasetType.isComponent(): 

532 ref = ref.makeCompositeRef() 

533 

534 # See if the ref is a composite that should be disassembled 

535 doDisassembly = self.composites.shouldBeDisassembled(ref) 

536 

537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

538 

539 if doDisassembly: 

540 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

541 compRef = ref.makeComponentRef(component) 

542 location, formatter = self._determine_put_formatter_location(compRef) 

543 all_info.append((location, formatter, componentStorage, component)) 

544 

545 else: 

546 # Always use the composite ref if no disassembly 

547 location, formatter = self._determine_put_formatter_location(ref) 

548 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

549 

550 # Convert the list of tuples to have StoredFileInfo as second element 

551 return [ 

552 ( 

553 location, 

554 StoredFileInfo( 

555 formatter=formatter, 

556 path=location.pathInStore.path, 

557 storageClass=storageClass, 

558 component=component, 

559 checksum=None, 

560 file_size=-1, 

561 ), 

562 ) 

563 for location, formatter, storageClass, component in all_info 

564 ] 

565 
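
Schematically, the prediction above yields one expected location for the whole composite, or one per component when the composites map says the dataset type should be disassembled. A simplified sketch, with placeholder names and a made-up path scheme in place of the real file templates:

from typing import Dict, List, Tuple

def predicted_paths(
    dataset_name: str,
    parent_storage_class: str,
    components: Dict[str, str],   # component name -> component storage class
    disassemble: bool,
) -> List[Tuple[str, str]]:
    """Return (path, storage_class) pairs the datastore would be expected to hold."""
    if disassemble:
        return [(f"{dataset_name}.{comp}.fits", sc) for comp, sc in components.items()]
    return [(f"{dataset_name}.fits", parent_storage_class)]

print(predicted_paths("calexp", "ExposureF", {"image": "ImageF", "mask": "MaskX"}, disassemble=True))
# [('calexp.image.fits', 'ImageF'), ('calexp.mask.fits', 'MaskX')]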

566 def _prepare_for_get( 

567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

568 ) -> List[DatastoreFileGetInformation]: 

569 """Check parameters for ``get`` and obtain formatter and 

570 location. 

571 

572 Parameters 

573 ---------- 

574 ref : `DatasetRef` 

575 Reference to the required Dataset. 

576 parameters : `dict` 

577 `StorageClass`-specific parameters that specify, for example, 

578 a slice of the dataset to be loaded. 

579 

580 Returns 

581 ------- 

582 getInfo : `list` [`DatastoreFileGetInformation`] 

583 Parameters needed to retrieve each file. 

584 """ 

585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

586 

587 # Get file metadata and internal metadata 

588 fileLocations = self._get_dataset_locations_info(ref) 

589 if not fileLocations: 

590 if not self.trustGetRequest: 

591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

592 # Assume the dataset is where we think it should be 

593 fileLocations = self._get_expected_dataset_locations_info(ref) 

594 

595 # The storage class we want to use eventually 

596 refStorageClass = ref.datasetType.storageClass 

597 

598 if len(fileLocations) > 1: 

599 disassembled = True 

600 

601 # If trust is involved it is possible that there will be 

602 # components listed here that do not exist in the datastore. 

603 # Explicitly check for file artifact existence and filter out any 

604 # that are missing. 

605 if self.trustGetRequest: 

606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

607 

608 # For now complain only if we have no components at all. One 

609 # component is probably a problem but we can punt that to the 

610 # assembler. 

611 if not fileLocations:  [611 ↛ 612: line 611 didn't jump to line 612, because the condition on line 611 was never true]

612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

613 

614 else: 

615 disassembled = False 

616 

617 # Is this a component request? 

618 refComponent = ref.datasetType.component() 

619 

620 fileGetInfo = [] 

621 for location, storedFileInfo in fileLocations: 

622 

623 # The storage class used to write the file 

624 writeStorageClass = storedFileInfo.storageClass 

625 

626 # If this has been disassembled we need read to match the write 

627 if disassembled: 

628 readStorageClass = writeStorageClass 

629 else: 

630 readStorageClass = refStorageClass 

631 

632 formatter = get_instance_of( 

633 storedFileInfo.formatter, 

634 FileDescriptor( 

635 location, 

636 readStorageClass=readStorageClass, 

637 storageClass=writeStorageClass, 

638 parameters=parameters, 

639 ), 

640 ref.dataId, 

641 ) 

642 

643 formatterParams, notFormatterParams = formatter.segregateParameters() 

644 

645 # Of the remaining parameters, extract the ones supported by 

646 # this StorageClass (for components not all will be handled) 

647 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

648 

649 # The ref itself could be a component if the dataset was 

650 # disassembled by butler, or we disassembled in datastore and 

651 # components came from the datastore records 

652 component = storedFileInfo.component if storedFileInfo.component else refComponent 

653 

654 fileGetInfo.append( 

655 DatastoreFileGetInformation( 

656 location, 

657 formatter, 

658 storedFileInfo, 

659 assemblerParams, 

660 formatterParams, 

661 component, 

662 readStorageClass, 

663 ) 

664 ) 

665 

666 return fileGetInfo 

667 

668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

669 """Check the arguments for ``put`` and obtain formatter and 

670 location. 

671 

672 Parameters 

673 ---------- 

674 inMemoryDataset : `object` 

675 The dataset to store. 

676 ref : `DatasetRef` 

677 Reference to the associated Dataset. 

678 

679 Returns 

680 ------- 

681 location : `Location` 

682 The location to write the dataset. 

683 formatter : `Formatter` 

684 The `Formatter` to use to write the dataset. 

685 

686 Raises 

687 ------ 

688 TypeError 

689 Supplied object and storage class are inconsistent. 

690 DatasetTypeNotSupportedError 

691 The associated `DatasetType` is not handled by this datastore. 

692 """ 

693 self._validate_put_parameters(inMemoryDataset, ref) 

694 return self._determine_put_formatter_location(ref) 

695 

696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

697 """Calculate the formatter and output location to use for put. 

698 

699 Parameters 

700 ---------- 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 """ 

711 # Work out output file name 

712 try: 

713 template = self.templates.getTemplate(ref) 

714 except KeyError as e: 

715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

716 

717 # Validate the template to protect against filenames from different 

718 # dataIds resolving to the same filename and causing overwrite confusion. 

719 template.validateTemplate(ref) 

720 

721 location = self.locationFactory.fromPath(template.format(ref)) 

722 

723 # Get the formatter based on the storage class 

724 storageClass = ref.datasetType.storageClass 

725 try: 

726 formatter = self.formatterFactory.getFormatter( 

727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

728 ) 

729 except KeyError as e: 

730 raise DatasetTypeNotSupportedError( 

731 f"Unable to find formatter for {ref} in datastore {self.name}" 

732 ) from e 

733 

734 # Now that we know the formatter, update the location 

735 location = formatter.makeUpdatedLocation(location) 

736 

737 return location, formatter 

738 

739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

740 # Docstring inherited from base class 

741 if transfer != "auto": 

742 return transfer 

743 

744 # See if the paths are within the datastore or not 

745 inside = [self._pathInStore(d.path) is not None for d in datasets] 

746 

747 if all(inside): 

748 transfer = None 

749 elif not any(inside):  [749 ↛ 758: line 749 didn't jump to line 758, because the condition on line 749 was never false]

750 # Allow ResourcePath to use its own knowledge 

751 transfer = "auto" 

752 else: 

753 # This can happen when importing from a datastore that 

754 # has had some datasets ingested using "direct" mode. 

755 # Also allow ResourcePath to sort it out but warn about it. 

756 # The files that remain outside the datastore will not be 

757 # copied into the target datastore. 

758 log.warning( 

759 "Some datasets are inside the datastore and some are outside. Using 'split' " 

760 "transfer mode. This assumes that the files outside the datastore are " 

761 "still accessible to the new butler since they will not be copied into " 

762 "the target datastore." 

763 ) 

764 transfer = "split" 

765 

766 return transfer 

767 
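
The "auto" override above boils down to a three-way decision on whether the supplied paths are inside the datastore root. A compact sketch of just that decision (illustrative, not the Datastore API):

from typing import Optional, Sequence

def resolve_auto_transfer(inside_flags: Sequence[bool]) -> Optional[str]:
    """Pick a transfer mode given, per dataset, whether its path is inside the root."""
    if inside_flags and all(inside_flags):
        return None      # everything is already in place; no transfer needed
    if not any(inside_flags):
        return "auto"    # everything is external; let the resource layer decide
    return "split"       # mixed: internal files stay put, external files are referenced

assert resolve_auto_transfer([True, True]) is None
assert resolve_auto_transfer([False, False]) == "auto"
assert resolve_auto_transfer([True, False]) == "split"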

768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

769 """Return path relative to datastore root 

770 

771 Parameters 

772 ---------- 

773 path : `lsst.resources.ResourcePathExpression` 

774 Path to dataset. Can be an absolute URI. If relative, it is 

775 assumed to be relative to the datastore root. The path is 

776 returned relative to the root, or `None` if it is outside. 

777 

778 Returns 

779 ------- 

780 inStore : `str` 

781 Path relative to datastore root. Returns `None` if the file is 

782 outside the root. 

783 """ 

784 # Relative path will always be relative to datastore 

785 pathUri = ResourcePath(path, forceAbsolute=False) 

786 return pathUri.relative_to(self.root) 

787 
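
The containment test in `_pathInStore` can be illustrated with plain `pathlib` instead of `lsst.resources` (a simplification: POSIX-style paths only, no URI schemes):

from pathlib import PurePosixPath
from typing import Optional

def path_in_store(path: str, root: str) -> Optional[str]:
    """Return the path relative to root, or None if it lies outside the root."""
    candidate = PurePosixPath(path)
    if not candidate.is_absolute():
        # Relative paths are already taken to be relative to the datastore root.
        return str(candidate)
    try:
        return str(candidate.relative_to(PurePosixPath(root)))
    except ValueError:
        return None

assert path_in_store("run/file.fits", "/repo") == "run/file.fits"
assert path_in_store("/repo/run/file.fits", "/repo") == "run/file.fits"
assert path_in_store("/elsewhere/file.fits", "/repo") is None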

788 def _standardizeIngestPath( 

789 self, path: ResourcePathExpression, *, transfer: Optional[str] = None 

790 ) -> Union[str, ResourcePath]: 

791 """Standardize the path of a to-be-ingested file. 

792 

793 Parameters 

794 ---------- 

795 path : `lsst.resources.ResourcePathExpression` 

796 Path of a file to be ingested. 

797 transfer : `str`, optional 

798 How (and whether) the dataset should be added to the datastore. 

799 See `ingest` for details of transfer modes. 

800 This implementation is provided only so 

801 `NotImplementedError` can be raised if the mode is not supported; 

802 actual transfers are deferred to `_extractIngestInfo`. 

803 

804 Returns 

805 ------- 

806 path : `str` or `lsst.resources.ResourcePath` 

807 New path in what the datastore considers standard form. If an 

808 absolute URI was given that will be returned unchanged. 

809 

810 Notes 

811 ----- 

812 Subclasses of `FileDatastore` can implement this method instead 

813 of `_prepIngest`. It should not modify the data repository or given 

814 file in any way. 

815 

816 Raises 

817 ------ 

818 NotImplementedError 

819 Raised if the datastore does not support the given transfer mode 

820 (including the case where ingest is not supported at all). 

821 FileNotFoundError 

822 Raised if one of the given files does not exist. 

823 """ 

824 if transfer not in (None, "direct", "split") + self.root.transferModes:  [824 ↛ 825: line 824 didn't jump to line 825, because the condition on line 824 was never true]

825 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

826 

827 # A relative URI indicates relative to datastore root 

828 srcUri = ResourcePath(path, forceAbsolute=False) 

829 if not srcUri.isabs(): 

830 srcUri = self.root.join(path) 

831 

832 if not srcUri.exists(): 

833 raise FileNotFoundError( 

834 f"Resource at {srcUri} does not exist; note that paths to ingest " 

835 f"are assumed to be relative to {self.root} unless they are absolute." 

836 ) 

837 

838 if transfer is None: 

839 relpath = srcUri.relative_to(self.root) 

840 if not relpath: 

841 raise RuntimeError( 

842 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

843 ) 

844 

845 # Return the relative path within the datastore for internal 

846 # transfer 

847 path = relpath 

848 

849 return path 

850 

851 def _extractIngestInfo( 

852 self, 

853 path: ResourcePathExpression, 

854 ref: DatasetRef, 

855 *, 

856 formatter: Union[Formatter, Type[Formatter]], 

857 transfer: Optional[str] = None, 

858 ) -> StoredFileInfo: 

859 """Relocate (if necessary) and extract `StoredFileInfo` from a 

860 to-be-ingested file. 

861 

862 Parameters 

863 ---------- 

864 path : `lsst.resources.ResourcePathExpression` 

865 URI or path of a file to be ingested. 

866 ref : `DatasetRef` 

867 Reference for the dataset being ingested. Guaranteed to have 

868 ``dataset_id`` that is not `None`. 

869 formatter : `type` or `Formatter` 

870 `Formatter` subclass to use for this dataset or an instance. 

871 transfer : `str`, optional 

872 How (and whether) the dataset should be added to the datastore. 

873 See `ingest` for details of transfer modes. 

874 

875 Returns 

876 ------- 

877 info : `StoredFileInfo` 

878 Internal datastore record for this file. This will be inserted by 

879 the caller; the `_extractIngestInfo` is only responsible for 

880 creating and populating the struct. 

881 

882 Raises 

883 ------ 

884 FileNotFoundError 

885 Raised if one of the given files does not exist. 

886 FileExistsError 

887 Raised if transfer is not `None` but the (internal) location the 

888 file would be moved to is already occupied. 

889 """ 

890 if self._transaction is None:  [890 ↛ 891: line 890 didn't jump to line 891, because the condition on line 890 was never true]

891 raise RuntimeError("Ingest called without transaction enabled") 

892 

893 # Create URI of the source path, do not need to force a relative 

894 # path to absolute. 

895 srcUri = ResourcePath(path, forceAbsolute=False) 

896 

897 # Track whether we have read the size of the source yet 

898 have_sized = False 

899 

900 tgtLocation: Optional[Location] 

901 if transfer is None or transfer == "split": 

902 # A relative path is assumed to be relative to the datastore 

903 # in this context 

904 if not srcUri.isabs(): 

905 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

906 else: 

907 # Work out the path in the datastore from an absolute URI 

908 # This is required to be within the datastore. 

909 pathInStore = srcUri.relative_to(self.root) 

910 if pathInStore is None and transfer is None:  [910 ↛ 911: line 910 didn't jump to line 911, because the condition on line 910 was never true]

911 raise RuntimeError( 

912 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

913 ) 

914 if pathInStore:  [914 ↛ 916: line 914 didn't jump to line 916, because the condition on line 914 was never false]

915 tgtLocation = self.locationFactory.fromPath(pathInStore) 

916 elif transfer == "split": 

917 # Outside the datastore but treat that as a direct ingest 

918 # instead. 

919 tgtLocation = None 

920 else: 

921 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

922 elif transfer == "direct":  [922 ↛ 927: line 922 didn't jump to line 927, because the condition on line 922 was never true]

923 # Want to store the full URI to the resource directly in 

924 # datastore. This is useful for referring to permanent archive 

925 # storage for raw data. 

926 # Trust that people know what they are doing. 

927 tgtLocation = None 

928 else: 

929 # Work out the name we want this ingested file to have 

930 # inside the datastore 

931 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

932 if not tgtLocation.uri.dirname().exists(): 

933 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

934 tgtLocation.uri.dirname().mkdir() 

935 

936 # if we are transferring from a local file to a remote location 

937 # it may be more efficient to get the size and checksum of the 

938 # local file rather than the transferred one 

939 if not srcUri.scheme or srcUri.scheme == "file":  [939 ↛ 949: line 939 didn't jump to line 949, because the condition on line 939 was never false]

940 size = srcUri.size() 

941 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

942 have_sized = True 

943 

944 # Transfer the resource to the destination. 

945 # Allow overwrite of an existing file. This matches the behavior 

946 # of datastore.put() in that it trusts that registry would not 

947 # be asking to overwrite unless registry thought that the 

948 # overwrite was allowed. 

949 tgtLocation.uri.transfer_from( 

950 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

951 ) 

952 

953 if tgtLocation is None:  [953 ↛ 955: line 953 didn't jump to line 955, because the condition on line 953 was never true]

954 # This means we are using direct mode 

955 targetUri = srcUri 

956 targetPath = str(srcUri) 

957 else: 

958 targetUri = tgtLocation.uri 

959 targetPath = tgtLocation.pathInStore.path 

960 

961 # the file should exist in the datastore now 

962 if not have_sized: 

963 size = targetUri.size() 

964 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

965 

966 return StoredFileInfo( 

967 formatter=formatter, 

968 path=targetPath, 

969 storageClass=ref.datasetType.storageClass, 

970 component=ref.datasetType.component(), 

971 file_size=size, 

972 checksum=checksum, 

973 ) 

974 

975 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

976 # Docstring inherited from Datastore._prepIngest. 

977 filtered = [] 

978 for dataset in datasets: 

979 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

980 if not acceptable: 

981 continue 

982 else: 

983 dataset.refs = acceptable 

984 if dataset.formatter is None: 

985 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

986 else: 

987 assert isinstance(dataset.formatter, (type, str)) 

988 formatter_class = get_class_of(dataset.formatter) 

989 if not issubclass(formatter_class, Formatter):  [989 ↛ 990: line 989 didn't jump to line 990, because the condition on line 989 was never true]

990 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

991 dataset.formatter = formatter_class 

992 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

993 filtered.append(dataset) 

994 return _IngestPrepData(filtered) 

995 

996 @transactional 

997 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

998 # Docstring inherited from Datastore._finishIngest. 

999 refsAndInfos = [] 

1000 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1001 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1002 # Do ingest as if the first dataset ref is associated with the file 

1003 info = self._extractIngestInfo( 

1004 dataset.path, dataset.refs[0], formatter=dataset.formatter, transfer=transfer 

1005 ) 

1006 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1007 self._register_datasets(refsAndInfos) 

1008 

1009 def _calculate_ingested_datastore_name( 

1010 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1011 ) -> Location: 

1012 """Given a source URI and a DatasetRef, determine the name the 

1013 dataset will have inside datastore. 

1014 

1015 Parameters 

1016 ---------- 

1017 srcUri : `lsst.resources.ResourcePath` 

1018 URI to the source dataset file. 

1019 ref : `DatasetRef` 

1020 Ref associated with the newly-ingested dataset artifact. This 

1021 is used to determine the name within the datastore. 

1022 formatter : `Formatter` or `Formatter` class. 

1023 Formatter to use for validation. Can be a class or an instance. 

1024 

1025 Returns 

1026 ------- 

1027 location : `Location` 

1028 Target location for the newly-ingested dataset. 

1029 """ 

1030 # Ingesting a file from outside the datastore. 

1031 # This involves a new name. 

1032 template = self.templates.getTemplate(ref) 

1033 location = self.locationFactory.fromPath(template.format(ref)) 

1034 

1035 # Get the extension 

1036 ext = srcUri.getExtension() 

1037 

1038 # Update the destination to include that extension 

1039 location.updateExtension(ext) 

1040 

1041 # Ask the formatter to validate this extension 

1042 formatter.validateExtension(location) 

1043 

1044 return location 

1045 
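
In outline, the ingested name is the rendered file template for the ref with the source file's extension appended and then validated by the formatter. A rough sketch of the first two steps (the template string, data ID and paths are hypothetical, and `os.path.splitext` keeps only the last suffix, unlike the real extension handling):

import os

def ingested_name(template: str, data_id: dict, src_path: str) -> str:
    """Render a hypothetical file template and adopt the source file's extension."""
    base = template.format(**data_id)       # e.g. "HSC/903334/raw_22"
    _, ext = os.path.splitext(src_path)     # e.g. ".fits"
    return base + ext

print(ingested_name("{instrument}/{visit}/raw_{detector}",
                    {"instrument": "HSC", "visit": 903334, "detector": 22},
                    "/staging/incoming/frame022.fits"))
# -> HSC/903334/raw_22.fits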

1046 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1047 """Write out in memory dataset to datastore. 

1048 

1049 Parameters 

1050 ---------- 

1051 inMemoryDataset : `object` 

1052 Dataset to write to datastore. 

1053 ref : `DatasetRef` 

1054 Registry information associated with this dataset. 

1055 

1056 Returns 

1057 ------- 

1058 info : `StoredFileInfo` 

1059 Information describing the artifact written to the datastore. 

1060 """ 

1061 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1062 uri = location.uri 

1063 

1064 if not uri.dirname().exists(): 

1065 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1066 uri.dirname().mkdir() 

1067 

1068 if self._transaction is None:  [1068 ↛ 1069: line 1068 didn't jump to line 1069, because the condition on line 1068 was never true]

1069 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1070 

1071 def _removeFileExists(uri: ResourcePath) -> None: 

1072 """Remove a file and do not complain if it is not there. 

1073 

1074 This is important since a formatter might fail before the file 

1075 is written and we should not confuse people by writing spurious 

1076 error messages to the log. 

1077 """ 

1078 try: 

1079 uri.remove() 

1080 except FileNotFoundError: 

1081 pass 

1082 

1083 # Register a callback to try to delete the uploaded data if 

1084 # something fails below 

1085 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1086 

1087 # For a local file, simply use the formatter directly 

1088 if uri.isLocal: 

1089 try: 

1090 formatter.write(inMemoryDataset) 

1091 except Exception as e: 

1092 raise RuntimeError( 

1093 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1094 ) from e 

1095 log.debug("Successfully wrote python object to local file at %s", uri) 

1096 else: 

1097 # This is a remote URI. Some datasets can be serialized directly 

1098 # to bytes and sent to the remote datastore without writing a 

1099 # file. If the dataset is intended to be saved to the cache 

1100 # a file is always written and direct write to the remote 

1101 # datastore is bypassed. 

1102 data_written = False 

1103 if not self.cacheManager.should_be_cached(ref): 

1104 try: 

1105 serializedDataset = formatter.toBytes(inMemoryDataset) 

1106 except NotImplementedError: 

1107 # Fallback to the file writing option. 

1108 pass 

1109 except Exception as e: 

1110 raise RuntimeError( 

1111 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1112 ) from e 

1113 else: 

1114 log.debug("Writing bytes directly to %s", uri) 

1115 uri.write(serializedDataset, overwrite=True) 

1116 log.debug("Successfully wrote bytes directly to %s", uri) 

1117 data_written = True 

1118 

1119 if not data_written: 

1120 # Did not write the bytes directly to object store so instead 

1121 # write to temporary file. 

1122 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1123 # Need to configure the formatter to write to a different 

1124 # location and that needs us to overwrite internals 

1125 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1126 with formatter._updateLocation(Location(None, temporary_uri)): 

1127 try: 

1128 formatter.write(inMemoryDataset) 

1129 except Exception as e: 

1130 raise RuntimeError( 

1131 f"Failed to serialize dataset {ref} of type" 

1132 f" {type(inMemoryDataset)} to " 

1133 f"temporary location {temporary_uri}" 

1134 ) from e 

1135 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1136 

1137 # Cache if required 

1138 self.cacheManager.move_to_cache(temporary_uri, ref) 

1139 

1140 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1141 

1142 # The URI is needed to resolve which ingest case we are dealing with. 

1143 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1144 

1145 def _read_artifact_into_memory( 

1146 self, 

1147 getInfo: DatastoreFileGetInformation, 

1148 ref: DatasetRef, 

1149 isComponent: bool = False, 

1150 cache_ref: Optional[DatasetRef] = None, 

1151 ) -> Any: 

1152 """Read the artifact from datastore into in memory object. 

1153 

1154 Parameters 

1155 ---------- 

1156 getInfo : `DatastoreFileGetInformation` 

1157 Information about the artifact within the datastore. 

1158 ref : `DatasetRef` 

1159 The registry information associated with this artifact. 

1160 isComponent : `bool` 

1161 Flag to indicate if a component is being read from this artifact. 

1162 cache_ref : `DatasetRef`, optional 

1163 The DatasetRef to use when looking up the file in the cache. 

1164 This ref must have the same ID as the supplied ref but can 

1165 be a parent ref or component ref to indicate to the cache whether 

1166 a composite file is being requested from the cache or a component 

1167 file. Without this the cache will default to the supplied ref but 

1168 it can get confused with read-only derived components for 

1169 disassembled composites. 

1170 

1171 Returns 

1172 ------- 

1173 inMemoryDataset : `object` 

1174 The artifact as a python object. 

1175 """ 

1176 location = getInfo.location 

1177 uri = location.uri 

1178 log.debug("Accessing data from %s", uri) 

1179 

1180 if cache_ref is None: 

1181 cache_ref = ref 

1182 if cache_ref.id != ref.id:  [1182 ↛ 1183: line 1182 didn't jump to line 1183, because the condition on line 1182 was never true]

1183 raise ValueError( 

1184 "The supplied cache dataset ref refers to a different dataset than expected:" 

1185 f" {ref.id} != {cache_ref.id}" 

1186 ) 

1187 

1188 # Cannot recalculate checksum but can compare size as a quick check 

1189 # Do not do this if the size is negative since that indicates 

1190 # we do not know. 

1191 recorded_size = getInfo.info.file_size 

1192 resource_size = uri.size() 

1193 if recorded_size >= 0 and resource_size != recorded_size:  [1193 ↛ 1194: line 1193 didn't jump to line 1194, because the condition on line 1193 was never true]

1194 raise RuntimeError( 

1195 "Integrity failure in Datastore. " 

1196 f"Size of file {uri} ({resource_size}) " 

1197 f"does not match size recorded in registry of {recorded_size}" 

1198 ) 

1199 

1200 # For the general case we have choices for how to proceed. 

1201 # 1. Always use a local file (downloading the remote resource to a 

1202 # temporary file if needed). 

1203 # 2. Use a threshold size and read into memory and use bytes. 

1204 # Use both for now with an arbitrary hand off size. 

1205 # This allows small datasets to be downloaded from remote object 

1206 # stores without requiring a temporary file. 

1207 

1208 formatter = getInfo.formatter 

1209 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1210 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1211 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1212 if cached_file is not None: 

1213 desired_uri = cached_file 

1214 msg = f" (cached version of {uri})" 

1215 else: 

1216 desired_uri = uri 

1217 msg = "" 

1218 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1219 serializedDataset = desired_uri.read() 

1220 log.debug( 

1221 "Deserializing %s from %d bytes from location %s with formatter %s", 

1222 f"component {getInfo.component}" if isComponent else "", 

1223 len(serializedDataset), 

1224 uri, 

1225 formatter.name(), 

1226 ) 

1227 try: 

1228 result = formatter.fromBytes( 

1229 serializedDataset, component=getInfo.component if isComponent else None 

1230 ) 

1231 except Exception as e: 

1232 raise ValueError( 

1233 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1234 f" ({ref.datasetType.name} from {uri}): {e}" 

1235 ) from e 

1236 else: 

1237 # Read from file. 

1238 

1239 # Have to update the Location associated with the formatter 

1240 # because formatter.read does not allow an override. 

1241 # This could be improved. 

1242 location_updated = False 

1243 msg = "" 

1244 

1245 # First check in cache for local version. 

1246 # The cache will only be relevant for remote resources but 

1247 # no harm in always asking. Context manager ensures that cache 

1248 # file is not deleted during cache expiration. 

1249 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1250 if cached_file is not None: 

1251 msg = f"(via cache read of remote file {uri})" 

1252 uri = cached_file 

1253 location_updated = True 

1254 

1255 with uri.as_local() as local_uri: 

1256 

1257 can_be_cached = False 

1258 if uri != local_uri:  [1258 ↛ 1260: line 1258 didn't jump to line 1260, because the condition on line 1258 was never true]

1259 # URI was remote and file was downloaded 

1260 cache_msg = "" 

1261 location_updated = True 

1262 

1263 if self.cacheManager.should_be_cached(cache_ref): 

1264 # In this scenario we want to ask if the downloaded 

1265 # file should be cached but we should not cache 

1266 # it until after we've used it (to ensure it can't 

1267 # be expired whilst we are using it). 

1268 can_be_cached = True 

1269 

1270 # Say that it is "likely" to be cached because 

1271 # if the formatter read fails we will not be 

1272 # caching this file. 

1273 cache_msg = " and likely cached" 

1274 

1275 msg = f"(via download to local file{cache_msg})" 

1276 

1277 # Calculate the (possibly) new location for the formatter 

1278 # to use. 

1279 newLocation = Location(*local_uri.split()) if location_updated else None 

1280 

1281 log.debug( 

1282 "Reading%s from location %s %s with formatter %s", 

1283 f" component {getInfo.component}" if isComponent else "", 

1284 uri, 

1285 msg, 

1286 formatter.name(), 

1287 ) 

1288 try: 

1289 with formatter._updateLocation(newLocation): 

1290 with time_this( 

1291 log, 

1292 msg="Reading%s from location %s %s with formatter %s", 

1293 args=( 

1294 f" component {getInfo.component}" if isComponent else "", 

1295 uri, 

1296 msg, 

1297 formatter.name(), 

1298 ), 

1299 ): 

1300 result = formatter.read(component=getInfo.component if isComponent else None) 

1301 except Exception as e: 

1302 raise ValueError( 

1303 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1304 f" ({ref.datasetType.name} from {uri}): {e}" 

1305 ) from e 

1306 

1307 # File was read successfully so can move to cache 

1308 if can_be_cached:  [1308 ↛ 1309: line 1308 didn't jump to line 1309, because the condition on line 1308 was never true]

1309 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1310 

1311 return self._post_process_get( 

1312 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1313 ) 

1314 
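
The read path above switches between reading bytes straight from the URI and materialising a local file (from the cache or by download), based on a hand-off size and whether the formatter supports reading from bytes. A sketch of just that decision (names are illustrative):

NBYTES_MAX = 10_000_000  # same arbitrary hand-off threshold used above

def choose_read_strategy(resource_size: int, formatter_can_read_bytes: bool) -> str:
    """Pick how to materialise an artifact before handing it to the formatter."""
    if resource_size <= NBYTES_MAX and formatter_can_read_bytes:
        return "read-bytes"   # small enough: pull the bytes directly from the URI
    return "local-file"       # otherwise: use the cached copy or download, then read()

assert choose_read_strategy(4_096, True) == "read-bytes"
assert choose_read_strategy(4_096, False) == "local-file"
assert choose_read_strategy(50_000_000, True) == "local-file"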

1315 def knows(self, ref: DatasetRef) -> bool: 

1316 """Check if the dataset is known to the datastore. 

1317 

1318 Does not check for existence of any artifact. 

1319 

1320 Parameters 

1321 ---------- 

1322 ref : `DatasetRef` 

1323 Reference to the required dataset. 

1324 

1325 Returns 

1326 ------- 

1327 exists : `bool` 

1328 `True` if the dataset is known to the datastore. 

1329 """ 

1330 fileLocations = self._get_dataset_locations_info(ref) 

1331 if fileLocations: 

1332 return True 

1333 return False 

1334 

1335 def _process_mexists_records( 

1336 self, 

1337 id_to_ref: Dict[DatasetId, DatasetRef], 

1338 records: Dict[DatasetId, List[StoredFileInfo]], 

1339 all_required: bool, 

1340 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1341 ) -> Dict[DatasetRef, bool]: 

1342 """Helper function for mexists that checks the given records. 

1343 

1344 Parameters 

1345 ---------- 

1346 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1347 Mapping of the dataset ID to the dataset ref itself. 

1348 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1349 Records as generally returned by 

1350 ``_get_stored_records_associated_with_refs``. 

1351 all_required : `bool` 

1352 Flag to indicate whether all artifacts associated with a 

1353 dataset ID must exist for that dataset to be reported as existing. 

1354 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1355 Optional mapping of datastore artifact to existence. Updated by 

1356 this method with details of all artifacts tested. Can be `None` 

1357 if the caller is not interested. 

1358 

1359 Returns 

1360 ------- 

1361 existence : `dict` of [`DatasetRef`, `bool`] 

1362 Mapping from dataset to boolean indicating existence. 

1363 """ 

1364 # The URIs to be checked and a mapping of those URIs to 

1365 # the dataset ID. 

1366 uris_to_check: List[ResourcePath] = [] 

1367 location_map: Dict[ResourcePath, DatasetId] = {} 

1368 

1369 location_factory = self.locationFactory 

1370 

1371 for ref_id, info in records.items(): 

1372 # Key is the dataset ID, value is a list of StoredFileInfo. 

1373 uris = [info.file_location(location_factory).uri for info in info] 

1374 uris_to_check.extend(uris) 

1375 location_map.update({uri: ref_id for uri in uris}) 

1376 

1377 uri_existence: Dict[ResourcePath, bool] = {} 

1378 if artifact_existence is not None: 

1379 # If a URI has already been checked remove it from the list 

1380 # and immediately add the status to the output dict. 

1381 filtered_uris_to_check = [] 

1382 for uri in uris_to_check: 

1383 if uri in artifact_existence: 

1384 uri_existence[uri] = artifact_existence[uri] 

1385 else: 

1386 filtered_uris_to_check.append(uri) 

1387 uris_to_check = filtered_uris_to_check 

1388 

1389 # Results. 

1390 dataset_existence: Dict[DatasetRef, bool] = {} 

1391 

1392 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1393 for uri, exists in uri_existence.items(): 

1394 dataset_id = location_map[uri] 

1395 ref = id_to_ref[dataset_id] 

1396 

1397 # Disassembled composite needs to check all locations. 

1398 # all_required indicates whether all need to exist or not. 

1399 if ref in dataset_existence: 

1400 if all_required: 

1401 exists = dataset_existence[ref] and exists 

1402 else: 

1403 exists = dataset_existence[ref] or exists 

1404 dataset_existence[ref] = exists 

1405 

1406 if artifact_existence is not None: 

1407 artifact_existence.update(uri_existence) 

1408 

1409 return dataset_existence 

1410 
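
How per-artifact existence rolls up to per-dataset existence in the helper above: with `all_required` every artifact of a disassembled composite must exist, otherwise any surviving artifact is enough (the guessed-location case under trust mode). A standalone sketch:

from typing import Dict, Iterable, Tuple

def combine_existence(checks: Iterable[Tuple[str, bool]], all_required: bool) -> Dict[str, bool]:
    """Fold (dataset_id, artifact_exists) pairs into a single flag per dataset."""
    result: Dict[str, bool] = {}
    for dataset_id, exists in checks:
        if dataset_id in result:
            previous = result[dataset_id]
            exists = (previous and exists) if all_required else (previous or exists)
        result[dataset_id] = exists
    return result

# Disassembled composite "d1" with one missing component:
assert combine_existence([("d1", True), ("d1", False)], all_required=True) == {"d1": False}
assert combine_existence([("d1", True), ("d1", False)], all_required=False) == {"d1": True}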

1411 def mexists( 

1412 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1413 ) -> Dict[DatasetRef, bool]: 

1414 """Check the existence of multiple datasets at once. 

1415 

1416 Parameters 

1417 ---------- 

1418 refs : iterable of `DatasetRef` 

1419 The datasets to be checked. 

1420 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1421 Optional mapping of datastore artifact to existence. Updated by 

1422 this method with details of all artifacts tested. Can be `None` 

1423 if the caller is not interested. 

1424 

1425 Returns 

1426 ------- 

1427 existence : `dict` of [`DatasetRef`, `bool`] 

1428 Mapping from dataset to boolean indicating existence. 

1429 """ 

1430 chunk_size = 10_000 

1431 dataset_existence: Dict[DatasetRef, bool] = {} 

1432 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1433 n_found_total = 0 

1434 n_checked = 0 

1435 n_chunks = 0 

1436 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1437 chunk_result = self._mexists(chunk, artifact_existence) 

1438 if log.isEnabledFor(VERBOSE): 

1439 n_results = len(chunk_result) 

1440 n_checked += n_results 

1441 # Can treat the booleans as 0, 1 integers and sum them. 

1442 n_found = sum(chunk_result.values()) 

1443 n_found_total += n_found 

1444 log.verbose( 

1445 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1446 n_chunks, 

1447 n_found, 

1448 n_results, 

1449 n_found_total, 

1450 n_checked, 

1451 ) 

1452 dataset_existence.update(chunk_result) 

1453 n_chunks += 1 

1454 

1455 return dataset_existence 

1456 
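
`mexists` batches its refs so that very large requests are checked in chunks of 10,000; the real code uses `lsst.utils.iteration.chunk_iterable`, but the pattern is the standard islice-based chunker sketched below:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunked(items: Iterable[T], chunk_size: int = 10_000) -> Iterator[List[T]]:
    """Yield successive lists of at most chunk_size items."""
    iterator = iter(items)
    while chunk := list(islice(iterator, chunk_size)):
        yield chunk

# 25,000 refs are processed as chunks of 10,000, 10,000 and 5,000:
assert [len(c) for c in chunked(range(25_000))] == [10_000, 10_000, 5_000]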

1457 def _mexists( 

1458 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1459 ) -> Dict[DatasetRef, bool]: 

1460 """Check the existence of multiple datasets at once. 

1461 

1462 Parameters 

1463 ---------- 

1464 refs : iterable of `DatasetRef` 

1465 The datasets to be checked. 

1466 

1467 Returns 

1468 ------- 

1469 existence : `dict` of [`DatasetRef`, `bool`] 

1470 Mapping from dataset to boolean indicating existence. 

1471 """ 

1472 # Need a mapping of dataset_id to dataset ref since the API 

1473 # works with dataset_id 

1474 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1475 

1476 # Set of all IDs we are checking for. 

1477 requested_ids = set(id_to_ref.keys()) 

1478 

1479 # The records themselves. Could be missing some entries. 

1480 records = self._get_stored_records_associated_with_refs(refs) 

1481 

1482 dataset_existence = self._process_mexists_records( 

1483 id_to_ref, records, True, artifact_existence=artifact_existence 

1484 ) 

1485 

1486 # Set of IDs that have been handled. 

1487 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1488 

1489 missing_ids = requested_ids - handled_ids 

1490 if missing_ids: 

1491 if not self.trustGetRequest: 

1492 # Must assume these do not exist 

1493 for missing in missing_ids: 

1494 dataset_existence[id_to_ref[missing]] = False 

1495 else: 

1496 log.debug( 

1497 "%d out of %d datasets were not known to datastore during initial existence check.", 

1498 len(missing_ids), 

1499 len(requested_ids), 

1500 ) 

1501 

1502 # Construct data structure identical to that returned 

1503 # by _get_stored_records_associated_with_refs() but using 

1504 # guessed names. 

1505 records = {} 

1506 for missing in missing_ids: 

1507 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1508 records[missing] = [info for _, info in expected] 

1509 

1510 dataset_existence.update( 

1511 self._process_mexists_records( 

1512 id_to_ref, records, False, artifact_existence=artifact_existence 

1513 ) 

1514 ) 

1515 

1516 return dataset_existence 

1517 

1518 def exists(self, ref: DatasetRef) -> bool: 

1519 """Check if the dataset exists in the datastore. 

1520 

1521 Parameters 

1522 ---------- 

1523 ref : `DatasetRef` 

1524 Reference to the required dataset. 

1525 

1526 Returns 

1527 ------- 

1528 exists : `bool` 

1529 `True` if the entity exists in the `Datastore`. 

1530 """ 

1531 fileLocations = self._get_dataset_locations_info(ref) 

1532 

1533 # if we are being asked to trust that registry might not be correct 

1534 # we ask for the expected locations and check them explicitly 

1535 if not fileLocations: 

1536 if not self.trustGetRequest: 

1537 return False 

1538 

1539 # When we are guessing a dataset location we can not check 

1540 # for the existence of every component since we can not 

1541 # know if every component was written. Instead we check 

1542 # for the existence of any of the expected locations. 

1543 for location, _ in self._get_expected_dataset_locations_info(ref): 1543 ↛ 1546line 1543 didn't jump to line 1546, because the loop on line 1543 didn't complete

1544 if self._artifact_exists(location): 1544 ↛ 1543line 1544 didn't jump to line 1543, because the condition on line 1544 was never false

1545 return True 

1546 return False 

1547 

1548 # All listed artifacts must exist. 

1549 for location, _ in fileLocations: 

1550 if not self._artifact_exists(location): 

1551 return False 

1552 

1553 return True 

1554 

1555 def getURIs( 

1556 self, ref: DatasetRef, predict: bool = False 

1557 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1558 """Return URIs associated with dataset. 

1559 

1560 Parameters 

1561 ---------- 

1562 ref : `DatasetRef` 

1563 Reference to the required dataset. 

1564 predict : `bool`, optional 

1565 If the datastore does not know about the dataset, should it 

1566 return a predicted URI or not? 

1567 

1568 Returns 

1569 ------- 

1570 primary : `lsst.resources.ResourcePath` 

1571 The URI to the primary artifact associated with this dataset. 

1572 If the dataset was disassembled within the datastore this 

1573 may be `None`. 

1574 components : `dict` 

1575 URIs to any components associated with the dataset artifact. 

1576 Can be empty if there are no components. 

1577 """ 

1578 

1579 primary: Optional[ResourcePath] = None 

1580 components: Dict[str, ResourcePath] = {} 

1581 

1582 # if this has never been written then we have to guess 

1583 if not self.exists(ref): 

1584 if not predict: 

1585 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1586 

1587 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1588 

1589 if doDisassembly: 

1590 

1591 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1592 compRef = ref.makeComponentRef(component) 

1593 compLocation, _ = self._determine_put_formatter_location(compRef) 

1594 

1595 # Add a URI fragment to indicate this is a guess 

1596 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted") 

1597 

1598 else: 

1599 

1600 location, _ = self._determine_put_formatter_location(ref) 

1601 

1602 # Add a URI fragment to indicate this is a guess 

1603 primary = ResourcePath(location.uri.geturl() + "#predicted") 

1604 

1605 return primary, components 

1606 

1607 # If this is a ref that we have written we can get the path. 

1608 # Get file metadata and internal metadata 

1609 fileLocations = self._get_dataset_locations_info(ref) 

1610 

1611 guessing = False 

1612 if not fileLocations: 

1613 if not self.trustGetRequest: 1613 ↛ 1614line 1613 didn't jump to line 1614, because the condition on line 1613 was never true

1614 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1615 fileLocations = self._get_expected_dataset_locations_info(ref) 

1616 guessing = True 

1617 

1618 if len(fileLocations) == 1: 

1619 # No disassembly so this is the primary URI 

1620 uri = fileLocations[0][0].uri 

1621 if guessing and not uri.exists(): 1621 ↛ 1622line 1621 didn't jump to line 1622, because the condition on line 1621 was never true

1622 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1623 primary = uri 

1624 

1625 else: 

1626 for location, storedFileInfo in fileLocations: 

1627 if storedFileInfo.component is None: 1627 ↛ 1628line 1627 didn't jump to line 1628, because the condition on line 1627 was never true

1628 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1629 uri = location.uri 

1630 if guessing and not uri.exists(): 1630 ↛ 1634line 1630 didn't jump to line 1634, because the condition on line 1630 was never true

1631 # If we are trusting then it is entirely possible for 

1632 # some components to be missing. In that case we skip 

1633 # to the next component. 

1634 if self.trustGetRequest: 

1635 continue 

1636 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1637 components[storedFileInfo.component] = uri 

1638 

1639 return primary, components 

1640 
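# Illustrative sketch (not part of fileDatastore.py): inspecting the primary
# and component URIs of a dataset. With predict=True, URIs for unwritten
# datasets carry a "#predicted" fragment, as documented above.
from lsst.daf.butler import DatasetRef


def describe_artifact_uris(datastore: "FileDatastore", ref: DatasetRef) -> None:
    primary, components = datastore.getURIs(ref, predict=True)
    if primary is not None:
        print(f"primary: {primary}")
    for name, uri in components.items():
        # Only disassembled composites populate this mapping.
        print(f"component {name}: {uri}")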

1641 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1642 """URI to the Dataset. 

1643 

1644 Parameters 

1645 ---------- 

1646 ref : `DatasetRef` 

1647 Reference to the required Dataset. 

1648 predict : `bool` 

1649 If `True`, allow URIs to be returned of datasets that have not 

1650 been written. 

1651 

1652 Returns 

1653 ------- 

1654 uri : `lsst.resources.ResourcePath` 

1655 URI pointing to the dataset within the datastore. If the 

1656 dataset does not exist in the datastore, and if ``predict`` is 

1657 `True`, the URI will be a prediction and will include a URI 

1658 fragment "#predicted". 

1659 If the datastore does not have entities that relate well 

1660 to the concept of a URI the returned URI will be 

1661 descriptive. The returned URI is not guaranteed to be obtainable. 

1662 

1663 Raises 

1664 ------ 

1665 FileNotFoundError 

1666 Raised if a URI has been requested for a dataset that does not 

1667 exist and guessing is not allowed. 

1668 RuntimeError 

1669 Raised if a request is made for a single URI but multiple URIs 

1670 are associated with this dataset. 

1671 

1672 Notes 

1673 ----- 

1674 When a predicted URI is requested an attempt will be made to form 

1675 a reasonable URI based on file templates and the expected formatter. 

1676 """ 

1677 primary, components = self.getURIs(ref, predict) 

1678 if primary is None or components: 1678 ↛ 1679line 1678 didn't jump to line 1679, because the condition on line 1678 was never true

1679 raise RuntimeError( 

1680 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1681 ) 

1682 return primary 

1683 

1684 def retrieveArtifacts( 

1685 self, 

1686 refs: Iterable[DatasetRef], 

1687 destination: ResourcePath, 

1688 transfer: str = "auto", 

1689 preserve_path: bool = True, 

1690 overwrite: bool = False, 

1691 ) -> List[ResourcePath]: 

1692 """Retrieve the file artifacts associated with the supplied refs. 

1693 

1694 Parameters 

1695 ---------- 

1696 refs : iterable of `DatasetRef` 

1697 The datasets for which file artifacts are to be retrieved. 

1698 A single ref can result in multiple files. The refs must 

1699 be resolved. 

1700 destination : `lsst.resources.ResourcePath` 

1701 Location to write the file artifacts. 

1702 transfer : `str`, optional 

1703 Method to use to transfer the artifacts. Must be one of the options 

1704 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1705 "move" is not allowed. 

1706 preserve_path : `bool`, optional 

1707 If `True` the full path of the file artifact within the datastore 

1708 is preserved. If `False` the final file component of the path 

1709 is used. 

1710 overwrite : `bool`, optional 

1711 If `True` allow transfers to overwrite existing files at the 

1712 destination. 

1713 

1714 Returns 

1715 ------- 

1716 targets : `list` of `lsst.resources.ResourcePath` 

1717 URIs of file artifacts in destination location. Order is not 

1718 preserved. 

1719 """ 

1720 if not destination.isdir(): 1720 ↛ 1721line 1720 didn't jump to line 1721, because the condition on line 1720 was never true

1721 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1722 

1723 if transfer == "move": 

1724 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1725 

1726 # Source -> Destination 

1727 # This also helps filter out duplicate DatasetRef in the request 

1728 # that will map to the same underlying file transfer. 

1729 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1730 

1731 for ref in refs: 

1732 locations = self._get_dataset_locations_info(ref) 

1733 for location, _ in locations: 

1734 source_uri = location.uri 

1735 target_path: ResourcePathExpression 

1736 if preserve_path: 

1737 target_path = location.pathInStore 

1738 if target_path.isabs(): 1738 ↛ 1741line 1738 didn't jump to line 1741, because the condition on line 1738 was never true

1739 # This is an absolute path to an external file. 

1740 # Use the full path. 

1741 target_path = target_path.relativeToPathRoot 

1742 else: 

1743 target_path = source_uri.basename() 

1744 target_uri = destination.join(target_path) 

1745 to_transfer[source_uri] = target_uri 

1746 

1747 # In theory can now parallelize the transfer 

1748 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1749 for source_uri, target_uri in to_transfer.items(): 

1750 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1751 

1752 return list(to_transfer.values()) 

1753 
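# Illustrative sketch (not part of fileDatastore.py): copying the file
# artifacts for some refs into a local directory. "move" is rejected by
# retrieveArtifacts(), so "copy" is used here.
from typing import Iterable, List

from lsst.daf.butler import DatasetRef
from lsst.resources import ResourcePath


def copy_artifacts(datastore: "FileDatastore", refs: Iterable[DatasetRef], dest: str) -> List[ResourcePath]:
    destination = ResourcePath(dest, forceDirectory=True)
    return datastore.retrieveArtifacts(
        refs, destination, transfer="copy", preserve_path=True, overwrite=False
    )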

1754 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1755 """Load an InMemoryDataset from the store. 

1756 

1757 Parameters 

1758 ---------- 

1759 ref : `DatasetRef` 

1760 Reference to the required Dataset. 

1761 parameters : `dict` 

1762 `StorageClass`-specific parameters that specify, for example, 

1763 a slice of the dataset to be loaded. 

1764 

1765 Returns 

1766 ------- 

1767 inMemoryDataset : `object` 

1768 Requested dataset or slice thereof as an InMemoryDataset. 

1769 

1770 Raises 

1771 ------ 

1772 FileNotFoundError 

1773 Requested dataset can not be retrieved. 

1774 TypeError 

1775 Return value from formatter has unexpected type. 

1776 ValueError 

1777 Formatter failed to process the dataset. 

1778 """ 

1779 allGetInfo = self._prepare_for_get(ref, parameters) 

1780 refComponent = ref.datasetType.component() 

1781 

1782 # Supplied storage class for the component being read 

1783 refStorageClass = ref.datasetType.storageClass 

1784 

1785 # Create mapping from component name to related info 

1786 allComponents = {i.component: i for i in allGetInfo} 

1787 

1788 # By definition the dataset is disassembled if we have more 

1789 # than one record for it. 

1790 isDisassembled = len(allGetInfo) > 1 

1791 

1792 # Look for the special case where we are disassembled but the 

1793 # component is a derived component that was not written during 

1794 # disassembly. For this scenario we need to check that the 

1795 # component requested is listed as a derived component for the 

1796 # composite storage class 

1797 isDisassembledReadOnlyComponent = False 

1798 if isDisassembled and refComponent: 

1799 # The composite storage class should be accessible through 

1800 # the component dataset type 

1801 compositeStorageClass = ref.datasetType.parentStorageClass 

1802 

1803 # In the unlikely scenario where the composite storage 

1804 # class is not known, we can only assume that this is a 

1805 # normal component. If that assumption is wrong then the 

1806 # branch below that reads a persisted component will fail 

1807 # so there is no need to complain here. 

1808 if compositeStorageClass is not None: 1808 ↛ 1811line 1808 didn't jump to line 1811, because the condition on line 1808 was never false

1809 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1810 

1811 if isDisassembled and not refComponent: 

1812 # This was a disassembled dataset spread over multiple files 

1813 # and we need to put them all back together again. 

1814 # Read into memory and then assemble 

1815 

1816 # Check that the supplied parameters are suitable for the type read 

1817 refStorageClass.validateParameters(parameters) 

1818 

1819 # We want to keep track of all the parameters that were not used 

1820 # by formatters. We assume that if any of the component formatters 

1821 # use a parameter that we do not need to apply it again in the 

1822 # assembler. 

1823 usedParams = set() 

1824 

1825 components: Dict[str, Any] = {} 

1826 for getInfo in allGetInfo: 

1827 # assemblerParams are parameters not understood by the 

1828 # associated formatter. 

1829 usedParams.update(set(getInfo.formatterParams)) 

1830 

1831 component = getInfo.component 

1832 

1833 if component is None: 1833 ↛ 1834line 1833 didn't jump to line 1834, because the condition on line 1833 was never true

1834 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1835 

1836 # We do not want the formatter to think it's reading 

1837 # a component though because it is really reading a 

1838 # standalone dataset -- always tell reader it is not a 

1839 # component. 

1840 components[component] = self._read_artifact_into_memory( 

1841 getInfo, ref.makeComponentRef(component), isComponent=False 

1842 ) 

1843 

1844 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1845 

1846 # Any unused parameters will have to be passed to the assembler 

1847 if parameters: 

1848 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1849 else: 

1850 unusedParams = {} 

1851 

1852 # Process parameters 

1853 return ref.datasetType.storageClass.delegate().handleParameters( 

1854 inMemoryDataset, parameters=unusedParams 

1855 ) 

1856 

1857 elif isDisassembledReadOnlyComponent: 

1858 

1859 compositeStorageClass = ref.datasetType.parentStorageClass 

1860 if compositeStorageClass is None: 1860 ↛ 1861line 1860 didn't jump to line 1861, because the condition on line 1860 was never true

1861 raise RuntimeError( 

1862 f"Unable to retrieve derived component '{refComponent}' since" 

1863 "no composite storage class is available." 

1864 ) 

1865 

1866 if refComponent is None: 1866 ↛ 1868line 1866 didn't jump to line 1868, because the condition on line 1866 was never true

1867 # Mainly for mypy 

1868 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1869 

1870 # Assume that every derived component can be calculated by 

1871 # forwarding the request to a single read/write component. 

1872 # Rather than guessing which rw component is the right one by 

1873 # scanning each for a derived component of the same name, 

1874 # we ask the storage class delegate directly which one is best to 

1875 # use. 

1876 compositeDelegate = compositeStorageClass.delegate() 

1877 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1878 refComponent, set(allComponents) 

1879 ) 

1880 

1881 # Select the relevant component 

1882 rwInfo = allComponents[forwardedComponent] 

1883 

1884 # For now assume that read parameters are validated against 

1885 # the real component and not the requested component 

1886 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1887 forwardedStorageClass.validateParameters(parameters) 

1888 

1889 # The reference to use for the caching must refer to the forwarded 

1890 # component and not the derived component. 

1891 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1892 

1893 # Unfortunately the FileDescriptor inside the formatter will have 

1894 # the wrong write storage class so we need to create a new one 

1895 # given the immutability constraint. 

1896 writeStorageClass = rwInfo.info.storageClass 

1897 

1898 # We may need to put some thought into parameters for read 

1899 # components but for now forward them on as is 

1900 readFormatter = type(rwInfo.formatter)( 

1901 FileDescriptor( 

1902 rwInfo.location, 

1903 readStorageClass=refStorageClass, 

1904 storageClass=writeStorageClass, 

1905 parameters=parameters, 

1906 ), 

1907 ref.dataId, 

1908 ) 

1909 

1910 # The assembler can not receive any parameter requests for a 

1911 # derived component at this time since the assembler will 

1912 # see the storage class of the derived component and those 

1913 # parameters will have to be handled by the formatter on the 

1914 # forwarded storage class. 

1915 assemblerParams: Dict[str, Any] = {} 

1916 

1917 # Need to create a new info that specifies the derived 

1918 # component and associated storage class 

1919 readInfo = DatastoreFileGetInformation( 

1920 rwInfo.location, 

1921 readFormatter, 

1922 rwInfo.info, 

1923 assemblerParams, 

1924 {}, 

1925 refComponent, 

1926 refStorageClass, 

1927 ) 

1928 

1929 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

1930 

1931 else: 

1932 # Single file request or component from that composite file 

1933 for lookup in (refComponent, None): 1933 ↛ 1938line 1933 didn't jump to line 1938, because the loop on line 1933 didn't complete

1934 if lookup in allComponents: 1934 ↛ 1933line 1934 didn't jump to line 1933, because the condition on line 1934 was never false

1935 getInfo = allComponents[lookup] 

1936 break 

1937 else: 

1938 raise FileNotFoundError( 

1939 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

1940 ) 

1941 

1942 # Do not need the component itself if already disassembled 

1943 if isDisassembled: 

1944 isComponent = False 

1945 else: 

1946 isComponent = getInfo.component is not None 

1947 

1948 # For a component read of a composite we want the cache to 

1949 # be looking at the composite ref itself. 

1950 cache_ref = ref.makeCompositeRef() if isComponent else ref 

1951 

1952 # For a disassembled component we can validate parameters against 

1953 # the component storage class directly 

1954 if isDisassembled: 

1955 refStorageClass.validateParameters(parameters) 

1956 else: 

1957 # For an assembled composite this could be a derived 

1958 # component derived from a real component. The validity 

1959 # of the parameters is not clear. For now validate against 

1960 # the composite storage class 

1961 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1962 

1963 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

1964 
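# Illustrative sketch (not part of fileDatastore.py): reading a full dataset
# and a single component through get(). "wcs" is a hypothetical component name
# that would need to exist in the dataset's storage class.
from lsst.daf.butler import DatasetRef


def read_examples(datastore: "FileDatastore", ref: DatasetRef):
    # Full read of the dataset (parameters are StorageClass-specific and are
    # validated as described above).
    full = datastore.get(ref)
    # Component read via a component ref of the same dataset.
    wcs = datastore.get(ref.makeComponentRef("wcs"))
    return full, wcs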

1965 @transactional 

1966 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1967 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1968 

1969 Parameters 

1970 ---------- 

1971 inMemoryDataset : `object` 

1972 The dataset to store. 

1973 ref : `DatasetRef` 

1974 Reference to the associated Dataset. 

1975 

1976 Raises 

1977 ------ 

1978 TypeError 

1979 Supplied object and storage class are inconsistent. 

1980 DatasetTypeNotSupportedError 

1981 The associated `DatasetType` is not handled by this datastore. 

1982 

1983 Notes 

1984 ----- 

1985 If the datastore is configured to reject certain dataset types it 

1986 is possible that the put will fail and raise a 

1987 `DatasetTypeNotSupportedError`. The main use case for this is to 

1988 allow `ChainedDatastore` to put to multiple datastores without 

1989 requiring that every datastore accepts the dataset. 

1990 """ 

1991 

1992 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1993 # doDisassembly = True 

1994 

1995 artifacts = [] 

1996 if doDisassembly: 

1997 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1998 for component, componentInfo in components.items(): 

1999 # Don't recurse because we want to take advantage of 

2000 # bulk insert -- need a new DatasetRef that refers to the 

2001 # same dataset_id but has the component DatasetType 

2002 # DatasetType does not refer to the types of components 

2003 # So we construct one ourselves. 

2004 compRef = ref.makeComponentRef(component) 

2005 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2006 artifacts.append((compRef, storedInfo)) 

2007 else: 

2008 # Write the entire thing out 

2009 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2010 artifacts.append((ref, storedInfo)) 

2011 

2012 self._register_datasets(artifacts) 

2013 
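# Illustrative sketch (not part of fileDatastore.py): writing an in-memory
# object and reading it back. Whether the object is disassembled into
# per-component files is decided by the composites configuration, as above.
from typing import Any

from lsst.daf.butler import DatasetRef


def put_and_get(datastore: "FileDatastore", obj: Any, ref: DatasetRef) -> Any:
    datastore.put(obj, ref)
    return datastore.get(ref)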

2014 @transactional 

2015 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2016 # At this point can safely remove these datasets from the cache 

2017 # to avoid confusion later on. If they are not trashed later 

2018 # the cache will simply be refilled. 

2019 self.cacheManager.remove_from_cache(ref) 

2020 

2021 # If we are in trust mode there will be nothing to move to 

2022 # the trash table and we will have to try to delete the file 

2023 # immediately. 

2024 if self.trustGetRequest: 

2025 # Try to keep the logic below for a single file trash. 

2026 if isinstance(ref, DatasetRef): 

2027 refs = {ref} 

2028 else: 

2029 # Will recreate ref at the end of this branch. 

2030 refs = set(ref) 

2031 

2032 # Determine which datasets are known to datastore directly. 

2033 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2034 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2035 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2036 

2037 missing = refs - existing_refs 

2038 if missing: 

2039 # Do an explicit existence check on these refs. 

2040 # We only care about the artifacts at this point and not 

2041 # the dataset existence. 

2042 artifact_existence: Dict[ResourcePath, bool] = {} 

2043 _ = self.mexists(missing, artifact_existence) 

2044 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2045 

2046 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2047 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2048 for uri in uris: 

2049 try: 

2050 uri.remove() 

2051 except Exception as e: 

2052 if ignore_errors: 

2053 log.debug("Artifact %s could not be removed: %s", uri, e) 

2054 continue 

2055 raise 

2056 

2057 # There is no point asking the code below to remove refs we 

2058 # know are missing so update it with the list of existing 

2059 # records. Try to retain one vs many logic. 

2060 if not existing_refs: 

2061 # Nothing more to do since none of the datasets were 

2062 # known to the datastore record table. 

2063 return 

2064 ref = list(existing_refs) 

2065 if len(ref) == 1: 

2066 ref = ref[0] 

2067 

2068 # Get file metadata and internal metadata 

2069 if not isinstance(ref, DatasetRef): 

2070 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2071 # Assumed to be an iterable of refs so bulk mode enabled. 

2072 try: 

2073 self.bridge.moveToTrash(ref) 

2074 except Exception as e: 

2075 if ignore_errors: 

2076 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2077 else: 

2078 raise 

2079 return 

2080 

2081 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2082 

2083 fileLocations = self._get_dataset_locations_info(ref) 

2084 

2085 if not fileLocations: 

2086 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2087 if ignore_errors: 

2088 log.warning(err_msg) 

2089 return 

2090 else: 

2091 raise FileNotFoundError(err_msg) 

2092 

2093 for location, storedFileInfo in fileLocations: 

2094 if not self._artifact_exists(location): 2094 ↛ 2095line 2094 didn't jump to line 2095

2095 err_msg = ( 

2096 f"Dataset is known to datastore {self.name} but " 

2097 f"associated artifact ({location.uri}) is missing" 

2098 ) 

2099 if ignore_errors: 

2100 log.warning(err_msg) 

2101 return 

2102 else: 

2103 raise FileNotFoundError(err_msg) 

2104 

2105 # Mark dataset as trashed 

2106 try: 

2107 self.bridge.moveToTrash([ref]) 

2108 except Exception as e: 

2109 if ignore_errors: 

2110 log.warning( 

2111 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2112 "but encountered an error: %s", 

2113 ref, 

2114 self.name, 

2115 e, 

2116 ) 

2117 pass 

2118 else: 

2119 raise 

2120 

2121 @transactional 

2122 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2123 """Remove all datasets from the trash. 

2124 

2125 Parameters 

2126 ---------- 

2127 ignore_errors : `bool` 

2128 If `True` return without error even if something went wrong. 

2129 Problems could occur if another process is simultaneously trying 

2130 to delete. 

2131 """ 

2132 log.debug("Emptying trash in datastore %s", self.name) 

2133 

2134 # Context manager will empty trash iff we finish it without raising. 

2135 # It will also automatically delete the relevant rows from the 

2136 # trash table and the records table. 

2137 with self.bridge.emptyTrash( 

2138 self._table, record_class=StoredFileInfo, record_column="path" 

2139 ) as trash_data: 

2140 # Removing the artifacts themselves requires that the files are 

2141 # not also associated with refs that are not to be trashed. 

2142 # Therefore need to do a query with the file paths themselves 

2143 # and return all the refs associated with them. Can only delete 

2144 # a file if the refs to be trashed are the only refs associated 

2145 # with the file. 

2146 # This requires multiple copies of the trashed items 

2147 trashed, artifacts_to_keep = trash_data 

2148 

2149 if artifacts_to_keep is None: 

2150 # The bridge is not helping us so have to work it out 

2151 # ourselves. This is not going to be as efficient. 

2152 trashed = list(trashed) 

2153 

2154 # The instance check is for mypy since up to this point it 

2155 # does not know the type of info. 

2156 path_map = self._refs_associated_with_artifacts( 

2157 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2158 ) 

2159 

2160 for ref, info in trashed: 

2161 

2162 # Mypy needs to know this is not the base class 

2163 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2164 

2165 # Check for mypy 

2166 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2167 

2168 path_map[info.path].remove(ref.id) 

2169 if not path_map[info.path]: 2169 ↛ 2160line 2169 didn't jump to line 2160, because the condition on line 2169 was never false

2170 del path_map[info.path] 

2171 

2172 artifacts_to_keep = set(path_map) 

2173 

2174 for ref, info in trashed: 

2175 

2176 # Should not happen for this implementation but need 

2177 # to keep mypy happy. 

2178 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2179 

2180 # Mypy needs to know this is not the base class 

2181 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2182 

2183 # Check for mypy 

2184 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2185 

2186 if info.path in artifacts_to_keep: 

2187 # This is a multi-dataset artifact and we are not 

2188 # removing all associated refs. 

2189 continue 

2190 

2191 # Only trashed refs still known to datastore will be returned. 

2192 location = info.file_location(self.locationFactory) 

2193 

2194 # Point of no return for this artifact 

2195 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2196 try: 

2197 self._delete_artifact(location) 

2198 except FileNotFoundError: 

2199 # If the file itself has been deleted there is nothing 

2200 # we can do about it. It is possible that trash has 

2201 # been run in parallel in another process or someone 

2202 # decided to delete the file. It is unlikely to come 

2203 # back and so we should still continue with the removal 

2204 # of the entry from the trash table. It is also possible 

2205 # we removed it in a previous iteration if it was 

2206 # a multi-dataset artifact. The delete artifact method 

2207 # will log a debug message in this scenario. 

2208 # Distinguishing file missing before trash started and 

2209 # file already removed previously as part of this trash 

2210 # is not worth the distinction with regards to potential 

2211 # memory cost. 

2212 pass 

2213 except Exception as e: 

2214 if ignore_errors: 

2215 # Use a debug message here even though it's not 

2216 # a good situation. In some cases this can be 

2217 # caused by a race between user A and user B 

2218 # and neither of them has permissions for the 

2219 # other's files. Butler does not know about users 

2220 # and trash has no idea what collections these 

2221 # files were in (without guessing from a path). 

2222 log.debug( 

2223 "Encountered error removing artifact %s from datastore %s: %s", 

2224 location.uri, 

2225 self.name, 

2226 e, 

2227 ) 

2228 else: 

2229 raise 

2230 
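# Illustrative sketch (not part of fileDatastore.py): two-phase deletion.
# trash() marks the datasets (or removes artifacts directly in trust mode) and
# emptyTrash() deletes artifacts whose refs are no longer referenced.
from typing import Iterable

from lsst.daf.butler import DatasetRef


def purge(datastore: "FileDatastore", refs: Iterable[DatasetRef]) -> None:
    datastore.trash(refs, ignore_errors=True)
    datastore.emptyTrash(ignore_errors=True)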

2231 @transactional 

2232 def transfer_from( 

2233 self, 

2234 source_datastore: Datastore, 

2235 refs: Iterable[DatasetRef], 

2236 local_refs: Optional[Iterable[DatasetRef]] = None, 

2237 transfer: str = "auto", 

2238 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2239 ) -> None: 

2240 # Docstring inherited 

2241 if type(self) is not type(source_datastore): 

2242 raise TypeError( 

2243 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2244 f"source datastore ({type(source_datastore)})." 

2245 ) 

2246 

2247 # Be explicit for mypy 

2248 if not isinstance(source_datastore, FileDatastore): 2248 ↛ 2249line 2248 didn't jump to line 2249, because the condition on line 2248 was never true

2249 raise TypeError( 

2250 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2251 f" {type(source_datastore)}" 

2252 ) 

2253 

2254 # Stop early if "direct" transfer mode is requested. That would 

2255 # require that the URI inside the source datastore should be stored 

2256 # directly in the target datastore, which seems unlikely to be useful 

2257 # since at any moment the source datastore could delete the file. 

2258 if transfer in ("direct", "split"): 

2259 raise ValueError( 

2260 f"Can not transfer from a source datastore using {transfer} mode since" 

2261 " those files are controlled by the other datastore." 

2262 ) 

2263 

2264 # Empty existence lookup if none given. 

2265 if artifact_existence is None: 

2266 artifact_existence = {} 

2267 

2268 # We will go through the list multiple times so must convert 

2269 # generators to lists. 

2270 refs = list(refs) 

2271 

2272 if local_refs is None: 

2273 local_refs = refs 

2274 else: 

2275 local_refs = list(local_refs) 

2276 

2277 # In order to handle disassembled composites the code works 

2278 # at the records level since it can assume that internal APIs 

2279 # can be used. 

2280 # - If the record already exists in the destination this is assumed 

2281 # to be okay. 

2282 # - If there is no record but the source and destination URIs are 

2283 # identical no transfer is done but the record is added. 

2284 # - If the source record refers to an absolute URI currently assume 

2285 # that that URI should remain absolute and will be visible to the 

2286 # destination butler. May need to have a flag to indicate whether 

2287 # the dataset should be transferred. This will only happen if 

2288 # the detached Butler has had a local ingest. 

2289 

2290 # What we really want is all the records in the source datastore 

2291 # associated with these refs. Or derived ones if they don't exist 

2292 # in the source. 

2293 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2294 

2295 # The source dataset_ids are the keys in these records 

2296 source_ids = set(source_records) 

2297 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2298 

2299 # The not None check is to appease mypy 

2300 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2301 missing_ids = requested_ids - source_ids 

2302 

2303 # Missing IDs can be okay if that datastore has allowed 

2304 # gets based on file existence. Should we transfer what we can 

2305 # or complain about it and warn? 

2306 if missing_ids and not source_datastore.trustGetRequest: 2306 ↛ 2307line 2306 didn't jump to line 2307, because the condition on line 2306 was never true

2307 raise ValueError( 

2308 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2309 ) 

2310 

2311 # Need to map these missing IDs to a DatasetRef so we can guess 

2312 # the details. 

2313 if missing_ids: 

2314 log.info( 

2315 "Number of expected datasets missing from source datastore records: %d out of %d", 

2316 len(missing_ids), 

2317 len(requested_ids), 

2318 ) 

2319 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2320 

2321 # This should be chunked in case we end up having to check 

2322 # the file store since we need some log output to show 

2323 # progress. 

2324 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2325 records = {} 

2326 for missing in missing_ids_chunk: 

2327 # Ask the source datastore where the missing artifacts 

2328 # should be. An execution butler might not know about the 

2329 # artifacts even if they are there. 

2330 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2331 records[missing] = [info for _, info in expected] 

2332 

2333 # Call the mexist helper method in case we have not already 

2334 # checked these artifacts such that artifact_existence is 

2335 # empty. This allows us to benefit from parallelism. 

2336 # datastore.mexists() itself does not give us access to the 

2337 # derived datastore record. 

2338 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2339 ref_exists = source_datastore._process_mexists_records( 

2340 id_to_ref, records, False, artifact_existence=artifact_existence 

2341 ) 

2342 

2343 # Now go through the records and propagate the ones that exist. 

2344 location_factory = source_datastore.locationFactory 

2345 for missing, record_list in records.items(): 

2346 # Skip completely if the ref does not exist. 

2347 ref = id_to_ref[missing] 

2348 if not ref_exists[ref]: 

2349 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2350 continue 

2351 # Check for file artifact to decide which parts of a 

2352 # disassembled composite do exist. If there is only a 

2353 # single record we don't even need to look because it can't 

2354 # be a composite and must exist. 

2355 if len(record_list) == 1: 

2356 dataset_records = record_list 

2357 else: 

2358 dataset_records = [ 

2359 record 

2360 for record in record_list 

2361 if artifact_existence[record.file_location(location_factory).uri] 

2362 ] 

2363 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2364 

2365 # Rely on source_records being a defaultdict. 

2366 source_records[missing].extend(dataset_records) 

2367 

2368 # See if we already have these records 

2369 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2370 

2371 # The artifacts to register 

2372 artifacts = [] 

2373 

2374 # Refs that already exist 

2375 already_present = [] 

2376 

2377 # Now can transfer the artifacts 

2378 for source_ref, target_ref in zip(refs, local_refs): 

2379 if target_ref.id in target_records: 

2380 # Already have an artifact for this. 

2381 already_present.append(target_ref) 

2382 continue 

2383 

2384 # mypy needs to know these are always resolved refs 

2385 for info in source_records[source_ref.getCheckedId()]: 

2386 source_location = info.file_location(source_datastore.locationFactory) 

2387 target_location = info.file_location(self.locationFactory) 

2388 if source_location == target_location: 2388 ↛ 2392line 2388 didn't jump to line 2392, because the condition on line 2388 was never true

2389 # Either the dataset is already in the target datastore 

2390 # (which is how execution butler currently runs) or 

2391 # it is an absolute URI. 

2392 if source_location.pathInStore.isabs(): 

2393 # Just because we can see the artifact when running 

2394 # the transfer doesn't mean it will be generally 

2395 # accessible to a user of this butler. For now warn 

2396 # but assume it will be accessible. 

2397 log.warning( 

2398 "Transfer request for an outside-datastore artifact has been found at %s", 

2399 source_location, 

2400 ) 

2401 else: 

2402 # Need to transfer it to the new location. 

2403 # Assume we should always overwrite. If the artifact 

2404 # is there this might indicate that a previous transfer 

2405 # was interrupted but was not able to be rolled back 

2406 # completely (eg pre-emption) so follow Datastore default 

2407 # and overwrite. 

2408 target_location.uri.transfer_from( 

2409 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2410 ) 

2411 

2412 artifacts.append((target_ref, info)) 

2413 

2414 self._register_datasets(artifacts) 

2415 

2416 if already_present: 

2417 n_skipped = len(already_present) 

2418 log.info( 

2419 "Skipped transfer of %d dataset%s already present in datastore", 

2420 n_skipped, 

2421 "" if n_skipped == 1 else "s", 

2422 ) 

2423 
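# Illustrative sketch (not part of fileDatastore.py): copying datasets from one
# FileDatastore into another. The shared artifact_existence cache avoids
# re-checking URIs when the source datastore is operating in trust mode.
from typing import Dict, Iterable

from lsst.daf.butler import DatasetRef
from lsst.resources import ResourcePath


def copy_between_datastores(
    source: "FileDatastore", target: "FileDatastore", refs: Iterable[DatasetRef]
) -> None:
    cache: Dict[ResourcePath, bool] = {}
    target.transfer_from(source, refs, transfer="copy", artifact_existence=cache)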

2424 @transactional 

2425 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2426 # Docstring inherited. 

2427 refs = list(refs) 

2428 self.bridge.forget(refs) 

2429 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2430 

2431 def validateConfiguration( 

2432 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2433 ) -> None: 

2434 """Validate some of the configuration for this datastore. 

2435 

2436 Parameters 

2437 ---------- 

2438 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2439 Entities to test against this configuration. Can be differing 

2440 types. 

2441 logFailures : `bool`, optional 

2442 If `True`, output a log message for every validation error 

2443 detected. 

2444 

2445 Raises 

2446 ------ 

2447 DatastoreValidationError 

2448 Raised if there is a validation problem with a configuration. 

2449 All the problems are reported in a single exception. 

2450 

2451 Notes 

2452 ----- 

2453 This method checks that all the supplied entities have valid file 

2454 templates and also have formatters defined. 

2455 """ 

2456 

2457 templateFailed = None 

2458 try: 

2459 self.templates.validateTemplates(entities, logFailures=logFailures) 

2460 except FileTemplateValidationError as e: 

2461 templateFailed = str(e) 

2462 

2463 formatterFailed = [] 

2464 for entity in entities: 

2465 try: 

2466 self.formatterFactory.getFormatterClass(entity) 

2467 except KeyError as e: 

2468 formatterFailed.append(str(e)) 

2469 if logFailures: 2469 ↛ 2464line 2469 didn't jump to line 2464, because the condition on line 2469 was never false

2470 log.critical("Formatter failure: %s", e) 

2471 

2472 if templateFailed or formatterFailed: 

2473 messages = [] 

2474 if templateFailed: 2474 ↛ 2475line 2474 didn't jump to line 2475, because the condition on line 2474 was never true

2475 messages.append(templateFailed) 

2476 if formatterFailed: 2476 ↛ 2478line 2476 didn't jump to line 2478, because the condition on line 2476 was never false

2477 messages.append(",".join(formatterFailed)) 

2478 msg = ";\n".join(messages) 

2479 raise DatastoreValidationError(msg) 

2480 
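# Illustrative sketch (not part of fileDatastore.py): checking that some
# dataset types have usable file templates and formatters before use.
from typing import Iterable

from lsst.daf.butler import DatasetType, DatastoreValidationError


def check_config(datastore: "FileDatastore", dataset_types: Iterable[DatasetType]) -> bool:
    try:
        datastore.validateConfiguration(dataset_types, logFailures=True)
    except DatastoreValidationError as err:
        # All problems are reported in a single exception, per the docstring.
        print(f"Configuration problems: {err}")
        return False
    return True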

2481 def getLookupKeys(self) -> Set[LookupKey]: 

2482 # Docstring is inherited from base class 

2483 return ( 

2484 self.templates.getLookupKeys() 

2485 | self.formatterFactory.getLookupKeys() 

2486 | self.constraints.getLookupKeys() 

2487 ) 

2488 

2489 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2490 # Docstring is inherited from base class 

2491 # The key can be valid in either formatters or templates so we can 

2492 # only check the template if it exists 

2493 if lookupKey in self.templates: 

2494 try: 

2495 self.templates[lookupKey].validateTemplate(entity) 

2496 except FileTemplateValidationError as e: 

2497 raise DatastoreValidationError(e) from e 

2498 

2499 def export( 

2500 self, 

2501 refs: Iterable[DatasetRef], 

2502 *, 

2503 directory: Optional[ResourcePathExpression] = None, 

2504 transfer: Optional[str] = "auto", 

2505 ) -> Iterable[FileDataset]: 

2506 # Docstring inherited from Datastore.export. 

2507 if transfer is not None and directory is None: 2507 ↛ 2508line 2507 didn't jump to line 2508, because the condition on line 2507 was never true

2508 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2509 

2510 # Force the directory to be a URI object 

2511 directoryUri: Optional[ResourcePath] = None 

2512 if directory is not None: 2512 ↛ 2515line 2512 didn't jump to line 2515, because the condition on line 2512 was never false

2513 directoryUri = ResourcePath(directory, forceDirectory=True) 

2514 

2515 if transfer is not None and directoryUri is not None: 2515 ↛ 2520line 2515 didn't jump to line 2520, because the condition on line 2515 was never false

2516 # mypy needs the second test 

2517 if not directoryUri.exists(): 2517 ↛ 2518line 2517 didn't jump to line 2518, because the condition on line 2517 was never true

2518 raise FileNotFoundError(f"Export location {directory} does not exist") 

2519 

2520 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2521 for ref in progress.wrap(refs, "Exporting dataset files"): 

2522 fileLocations = self._get_dataset_locations_info(ref) 

2523 if not fileLocations: 2523 ↛ 2524line 2523 didn't jump to line 2524, because the condition on line 2523 was never true

2524 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2525 # For now we can not export disassembled datasets 

2526 if len(fileLocations) > 1: 

2527 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2528 location, storedFileInfo = fileLocations[0] 

2529 

2530 pathInStore = location.pathInStore.path 

2531 if transfer is None: 2531 ↛ 2535line 2531 didn't jump to line 2535, because the condition on line 2531 was never true

2532 # TODO: do we also need to return the readStorageClass somehow? 

2533 # We will use the path in store directly. If this is an 

2534 # absolute URI, preserve it. 

2535 if location.pathInStore.isabs(): 

2536 pathInStore = str(location.uri) 

2537 elif transfer == "direct": 2537 ↛ 2539line 2537 didn't jump to line 2539, because the condition on line 2537 was never true

2538 # Use full URIs to the remote store in the export 

2539 pathInStore = str(location.uri) 

2540 else: 

2541 # mypy needs help 

2542 assert directoryUri is not None, "directoryUri must be defined to get here" 

2543 storeUri = ResourcePath(location.uri) 

2544 

2545 # if the datastore has an absolute URI to a resource, we 

2546 # have two options: 

2547 # 1. Keep the absolute URI in the exported YAML 

2548 # 2. Allocate a new name in the local datastore and transfer 

2549 # it. 

2550 # For now go with option 2 

2551 if location.pathInStore.isabs(): 2551 ↛ 2552line 2551 didn't jump to line 2552, because the condition on line 2551 was never true

2552 template = self.templates.getTemplate(ref) 

2553 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2554 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2555 

2556 exportUri = directoryUri.join(pathInStore) 

2557 exportUri.transfer_from(storeUri, transfer=transfer) 

2558 

2559 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2560 
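# Illustrative sketch (not part of fileDatastore.py): exporting file artifacts
# alongside their metadata. export() is a generator of FileDataset entries, so
# it must be consumed; disassembled datasets are not currently exportable.
from typing import Iterable, List

from lsst.daf.butler import DatasetRef, FileDataset


def export_to_directory(
    datastore: "FileDatastore", refs: Iterable[DatasetRef], directory: str
) -> List[FileDataset]:
    return list(datastore.export(refs, directory=directory, transfer="auto"))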

2561 @staticmethod 

2562 def computeChecksum( 

2563 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2564 ) -> Optional[str]: 

2565 """Compute the checksum of the supplied file. 

2566 

2567 Parameters 

2568 ---------- 

2569 uri : `lsst.resources.ResourcePath` 

2570 Name of resource to calculate checksum from. 

2571 algorithm : `str`, optional 

2572 Name of algorithm to use. Must be one of the algorithms supported 

2573 by :py:mod:`hashlib`. 

2574 block_size : `int` 

2575 Number of bytes to read from file at one time. 

2576 

2577 Returns 

2578 ------- 

2579 hexdigest : `str` 

2580 Hex digest of the file. 

2581 

2582 Notes 

2583 ----- 

2584 Currently returns None if the URI is for a remote resource. 

2585 """ 

2586 if algorithm not in hashlib.algorithms_guaranteed: 2586 ↛ 2587line 2586 didn't jump to line 2587, because the condition on line 2586 was never true

2587 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2588 

2589 if not uri.isLocal: 2589 ↛ 2590line 2589 didn't jump to line 2590, because the condition on line 2589 was never true

2590 return None 

2591 

2592 hasher = hashlib.new(algorithm) 

2593 

2594 with uri.as_local() as local_uri: 

2595 with open(local_uri.ospath, "rb") as f: 

2596 for chunk in iter(lambda: f.read(block_size), b""): 

2597 hasher.update(chunk) 

2598 

2599 return hasher.hexdigest() 

2600 
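# Illustrative sketch (not part of fileDatastore.py): checksumming a local
# file. computeChecksum() is a staticmethod and returns `None` for remote URIs;
# the path passed in is hypothetical.
from typing import Optional

from lsst.resources import ResourcePath


def checksum_local_file(path: str) -> Optional[str]:
    # sha256 is in hashlib.algorithms_guaranteed, so it passes the check above.
    return FileDatastore.computeChecksum(ResourcePath(path), algorithm="sha256", block_size=65536)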

2601 def needs_expanded_data_ids( 

2602 self, 

2603 transfer: Optional[str], 

2604 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2605 ) -> bool: 

2606 # Docstring inherited. 

2607 # This _could_ also use entity to inspect whether the filename template 

2608 # involves placeholders other than the required dimensions for its 

2609 # dataset type, but that's not necessary for correctness; it just 

2610 # enables more optimizations (perhaps only in theory). 

2611 return transfer not in ("direct", None)