Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%


844 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 CompositesMap, 

48 Config, 

49 DatasetId, 

50 DatasetRef, 

51 DatasetType, 

52 DatasetTypeNotSupportedError, 

53 Datastore, 

54 DatastoreCacheManager, 

55 DatastoreConfig, 

56 DatastoreDisabledCacheManager, 

57 DatastoreValidationError, 

58 FileDataset, 

59 FileDescriptor, 

60 FileTemplates, 

61 FileTemplateValidationError, 

62 Formatter, 

63 FormatterFactory, 

64 Location, 

65 LocationFactory, 

66 Progress, 

67 StorageClass, 

68 StoredFileInfo, 

69 ddl, 

70) 

71from lsst.daf.butler.core.repoRelocation import replaceRoot 

72from lsst.daf.butler.core.utils import transactional 

73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

74from lsst.resources import ResourcePath, ResourcePathExpression 

75from lsst.utils.introspection import get_class_of, get_instance_of 

76from lsst.utils.iteration import chunk_iterable 

77 

78# For VERBOSE logging usage. 

79from lsst.utils.logging import VERBOSE, getLogger 

80from lsst.utils.timer import time_this 

81from sqlalchemy import BigInteger, String 

82 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING: 85 ↛ 86 (line 85 didn't jump to line 86 because the condition on line 85 was never true)

86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Dict[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Dict[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ResourcePath 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters( 

212 DatastoreConfig, 

213 config, 

214 full, 

215 toUpdate={"root": root}, 

216 toCopy=("cls", ("records", "table")), 

217 overwrite=overwrite, 

218 ) 

219 

220 @classmethod 

221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

222 return ddl.TableSpec( 

223 fields=[ 

224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

228 # Use empty string to indicate no component 

229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

230 # TODO: should checksum be Base64Bytes instead? 

231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

233 ], 

234 unique=frozenset(), 

235 indexes=[tuple(["path"])], 

236 ) 

237 
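# Illustrative sketch (not part of the source): one row of the opaque records
# table described by makeTableSpec() above. Only the column names come from
# the field specs; every value below is an invented example.
example_record = {
    "dataset_id": "6b1f7c2e-0000-0000-0000-000000000000",  # primary key; actual type depends on the registry
    "path": "run/datasetType/datasetType_v1.json",          # relative to the datastore root
    "formatter": "lsst.daf.butler.formatters.json.JsonFormatter",
    "storage_class": "StructuredDataDict",
    "component": "",                                         # empty string means "no component"
    "checksum": None,                                        # only populated when checksums are enabled
    "file_size": 1024,
}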

238 def __init__( 

239 self, 

240 config: Union[DatastoreConfig, str], 

241 bridgeManager: DatastoreRegistryBridgeManager, 

240 butlerRoot: Optional[str] = None, 

243 ): 

244 super().__init__(config, bridgeManager) 

245 if "root" not in self.config: 245 ↛ 246 (line 245 didn't jump to line 246 because the condition on line 245 was never true)

246 raise ValueError("No root directory specified in configuration") 

247 

248 # Name ourselves either using an explicit name or a name 

249 # derived from the (unexpanded) root 

250 if "name" in self.config: 

251 self.name = self.config["name"] 

252 else: 

253 # We use the unexpanded root in the name to indicate that this 

254 # datastore can be moved without having to update registry. 

255 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

256 

257 # Support repository relocation in config 

258 # Existence of self.root is checked in subclass 

259 self.root = ResourcePath( 

260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

261 ) 

262 

263 self.locationFactory = LocationFactory(self.root) 

264 self.formatterFactory = FormatterFactory() 

265 

266 # Now associate formatters with storage classes 

267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

268 

269 # Read the file naming templates 

270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

271 

272 # See if composites should be disassembled 

273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

280 ) 

281 # Interface to Registry. 

282 self._bridge = bridgeManager.register(self.name) 

283 except ReadOnlyDatabaseError: 

284 # If the database is read only and we just tried and failed to 

285 # create a table, it means someone is trying to create a read-only 

286 # butler client for an empty repo. That should be okay, as long 

287 # as they then try to get any datasets before some other client 

288 # creates the table. Chances are they're just validating 

289 # configuration. 

290 pass 

291 

292 # Determine whether checksums should be used - default to False 

293 self.useChecksum = self.config.get("checksum", False) 

294 

295 # Determine whether we can fall back to configuration if a 

296 # requested dataset is not known to registry 

297 self.trustGetRequest = self.config.get("trust_get_request", False) 

298 

299 # Create a cache manager 

300 self.cacheManager: AbstractDatastoreCacheManager 

301 if "cached" in self.config: 301 ↛ 304 (line 301 didn't jump to line 304 because the condition on line 301 was never false)

302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

305 

306 # Check existence and create directory structure if necessary 

307 if not self.root.exists(): 

308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309 (line 308 didn't jump to line 309 because the condition on line 308 was never true)

309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

310 try: 

311 self.root.mkdir() 

312 except Exception as e: 

313 raise ValueError( 

314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

315 ) from e 

316 

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 def _artifact_exists(self, location: Location) -> bool: 

325 """Check that an artifact exists in this datastore at the specified 

326 location. 

327 

328 Parameters 

329 ---------- 

330 location : `Location` 

331 Expected location of the artifact associated with this datastore. 

332 

333 Returns 

334 ------- 

335 exists : `bool` 

336 `True` if the location can be found, `False` otherwise. 

337 """ 

338 log.debug("Checking if resource exists: %s", location.uri) 

339 return location.uri.exists() 

340 

341 def _delete_artifact(self, location: Location) -> None: 

342 """Delete the artifact from the datastore. 

343 

344 Parameters 

345 ---------- 

346 location : `Location` 

347 Location of the artifact associated with this datastore. 

348 """ 

349 if location.pathInStore.isabs(): 349 ↛ 350 (line 349 didn't jump to line 350 because the condition on line 349 was never true)

350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

351 

352 try: 

353 location.uri.remove() 

354 except FileNotFoundError: 

355 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

356 raise 

357 except Exception as e: 

358 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

359 raise 

360 log.debug("Successfully deleted file: %s", location.uri) 

361 

362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

363 # Docstring inherited from GenericBaseDatastore 

364 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

365 self._table.insert(*records) 

366 

367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

368 # Docstring inherited from GenericBaseDatastore 

369 

370 # Look for the dataset_id -- there might be multiple matches 

371 # if we have disassembled the dataset. 

372 records = self._table.fetch(dataset_id=ref.id) 

373 return [StoredFileInfo.from_record(record) for record in records] 

374 

375 def _get_stored_records_associated_with_refs( 

376 self, refs: Iterable[DatasetIdRef] 

377 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

378 """Retrieve all records associated with the provided refs. 

379 

380 Parameters 

381 ---------- 

382 refs : iterable of `DatasetIdRef` 

383 The refs for which records are to be retrieved. 

384 

385 Returns 

386 ------- 

387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

388 The matching records indexed by the ref ID. The number of entries 

389 in the dict can be smaller than the number of requested refs. 

390 """ 

391 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

392 

393 # Uniqueness is dataset_id + component so can have multiple records 

394 # per ref. 

395 records_by_ref = defaultdict(list) 

396 for record in records: 

397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

398 return records_by_ref 

399 

400 def _refs_associated_with_artifacts( 

401 self, paths: List[Union[str, ResourcePath]] 

402 ) -> Dict[str, Set[DatasetId]]: 

403 """Return paths and associated dataset refs. 

404 

405 Parameters 

406 ---------- 

407 paths : `list` of `str` or `lsst.resources.ResourcePath` 

408 All the paths to include in search. 

409 

410 Returns 

411 ------- 

412 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

413 Mapping of each path to a set of associated database IDs. 

414 """ 

415 records = self._table.fetch(path=[str(path) for path in paths]) 

416 result = defaultdict(set) 

417 for row in records: 

418 result[row["path"]].add(row["dataset_id"]) 

419 return result 

420 

421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

422 """Return all dataset refs associated with the supplied path. 

423 

424 Parameters 

425 ---------- 

426 pathInStore : `lsst.resources.ResourcePath` 

427 Path of interest in the data store. 

428 

429 Returns 

430 ------- 

431 ids : `set` of `DatasetId` 

432 All `DatasetRef` IDs associated with this path. 

433 """ 

434 records = list(self._table.fetch(path=str(pathInStore))) 

435 ids = {r["dataset_id"] for r in records} 

436 return ids 

437 

438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

439 # Docstring inherited from GenericBaseDatastore 

440 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

441 

442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

443 r"""Find all the `Location`\ s of the requested dataset in the 

444 `Datastore` and the associated stored file information. 

445 

446 Parameters 

447 ---------- 

448 ref : `DatasetRef` 

449 Reference to the required `Dataset`. 

450 

451 Returns 

452 ------- 

453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

454 Location of the dataset within the datastore and 

455 stored information about each file and its formatter. 

456 """ 

457 # Get the file information (this will fail if no file) 

458 records = self.getStoredItemsInfo(ref) 

459 

460 # Use the path to determine the location -- we need to take 

461 # into account absolute URIs in the datastore record 

462 return [(r.file_location(self.locationFactory), r) for r in records] 

463 

464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

465 """Check that there is only one dataset associated with the 

466 specified artifact. 

467 

468 Parameters 

469 ---------- 

470 ref : `DatasetRef` or `FakeDatasetRef` 

471 Dataset to be removed. 

472 location : `Location` 

473 The location of the artifact to be removed. 

474 

475 Returns 

476 ------- 

477 can_remove : `bool` 

478 `True` if the artifact can be safely removed. 

479 """ 

480 # Can't ever delete absolute URIs. 

481 if location.pathInStore.isabs(): 

482 return False 

483 

484 # Get all entries associated with this path 

485 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

486 if not allRefs: 

487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

488 

489 # Remove these refs from all the refs and if there is nothing left 

490 # then we can delete 

491 remainingRefs = allRefs - {ref.id} 

492 

493 if remainingRefs: 

494 return False 

495 return True 

496 

497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

498 """Predict the location and related file information of the requested 

499 dataset in this datastore. 

500 

501 Parameters 

502 ---------- 

503 ref : `DatasetRef` 

504 Reference to the required `Dataset`. 

505 

506 Returns 

507 ------- 

508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

509 Expected Location of the dataset within the datastore and 

510 placeholder information about each file and its formatter. 

511 

512 Notes 

513 ----- 

514 Uses the current configuration to determine how we would expect the 

515 datastore files to have been written if we couldn't ask registry. 

516 This is safe so long as there has been no change to datastore 

517 configuration between writing the dataset and wanting to read it. 

518 Will not work for files that have been ingested without using the 

519 standard file template or default formatter. 

520 """ 

521 

522 # If we have a component ref we always need to ask the questions 

523 # of the composite. If the composite is disassembled this routine 

524 # should return all components. If the composite was not 

525 # disassembled the composite is what is stored regardless of 

526 # component request. Note that if the caller has disassembled 

527 # a composite there is no way for this guess to know that 

528 # without trying both the composite and component ref and seeing 

529 # if there is something at the component Location even without 

530 # disassembly being enabled. 

531 if ref.datasetType.isComponent(): 

532 ref = ref.makeCompositeRef() 

533 

534 # See if the ref is a composite that should be disassembled 

535 doDisassembly = self.composites.shouldBeDisassembled(ref) 

536 

537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

538 

539 if doDisassembly: 

540 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

541 compRef = ref.makeComponentRef(component) 

542 location, formatter = self._determine_put_formatter_location(compRef) 

543 all_info.append((location, formatter, componentStorage, component)) 

544 

545 else: 

546 # Always use the composite ref if no disassembly 

547 location, formatter = self._determine_put_formatter_location(ref) 

548 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

549 

550 # Convert the list of tuples to have StoredFileInfo as second element 

551 return [ 

552 ( 

553 location, 

554 StoredFileInfo( 

555 formatter=formatter, 

556 path=location.pathInStore.path, 

557 storageClass=storageClass, 

558 component=component, 

559 checksum=None, 

560 file_size=-1, 

561 ), 

562 ) 

563 for location, formatter, storageClass, component in all_info 

564 ] 

565 

566 def _prepare_for_get( 

567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

568 ) -> List[DatastoreFileGetInformation]: 

569 """Check parameters for ``get`` and obtain formatter and 

570 location. 

571 

572 Parameters 

573 ---------- 

574 ref : `DatasetRef` 

575 Reference to the required Dataset. 

576 parameters : `dict` 

577 `StorageClass`-specific parameters that specify, for example, 

578 a slice of the dataset to be loaded. 

579 

580 Returns 

581 ------- 

582 getInfo : `list` [`DatastoreFileGetInformation`] 

583 Parameters needed to retrieve each file. 

584 """ 

585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

586 

587 # Get file metadata and internal metadata 

588 fileLocations = self._get_dataset_locations_info(ref) 

589 if not fileLocations: 

590 if not self.trustGetRequest: 

591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

592 # Assume the dataset is where we think it should be 

593 fileLocations = self._get_expected_dataset_locations_info(ref) 

594 

595 # The storage class we want to use eventually 

596 refStorageClass = ref.datasetType.storageClass 

597 

598 if len(fileLocations) > 1: 

599 disassembled = True 

600 

601 # If trust is involved it is possible that there will be 

602 # components listed here that do not exist in the datastore. 

603 # Explicitly check for file artifact existence and filter out any 

604 # that are missing. 

605 if self.trustGetRequest: 

606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

607 

608 # For now complain only if we have no components at all. One 

609 # component is probably a problem but we can punt that to the 

610 # assembler. 

611 if not fileLocations: 611 ↛ 612 (line 611 didn't jump to line 612 because the condition on line 611 was never true)

612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

613 

614 else: 

615 disassembled = False 

616 

617 # Is this a component request? 

618 refComponent = ref.datasetType.component() 

619 

620 fileGetInfo = [] 

621 for location, storedFileInfo in fileLocations: 

622 

623 # The storage class used to write the file 

624 writeStorageClass = storedFileInfo.storageClass 

625 

626 # If this has been disassembled we need read to match the write 

627 if disassembled: 

628 readStorageClass = writeStorageClass 

629 else: 

630 readStorageClass = refStorageClass 

631 

632 formatter = get_instance_of( 

633 storedFileInfo.formatter, 

634 FileDescriptor( 

635 location, 

636 readStorageClass=readStorageClass, 

637 storageClass=writeStorageClass, 

638 parameters=parameters, 

639 ), 

640 ref.dataId, 

641 ) 

642 

643 formatterParams, notFormatterParams = formatter.segregateParameters() 

644 

645 # Of the remaining parameters, extract the ones supported by 

646 # this StorageClass (for components not all will be handled) 

647 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

648 

649 # The ref itself could be a component if the dataset was 

650 # disassembled by butler, or we disassembled in datastore and 

651 # components came from the datastore records 

652 component = storedFileInfo.component if storedFileInfo.component else refComponent 

653 

654 fileGetInfo.append( 

655 DatastoreFileGetInformation( 

656 location, 

657 formatter, 

658 storedFileInfo, 

659 assemblerParams, 

660 formatterParams, 

661 component, 

662 readStorageClass, 

663 ) 

664 ) 

665 

666 return fileGetInfo 

667 

668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

669 """Check the arguments for ``put`` and obtain formatter and 

670 location. 

671 

672 Parameters 

673 ---------- 

674 inMemoryDataset : `object` 

675 The dataset to store. 

676 ref : `DatasetRef` 

677 Reference to the associated Dataset. 

678 

679 Returns 

680 ------- 

681 location : `Location` 

682 The location to write the dataset. 

683 formatter : `Formatter` 

684 The `Formatter` to use to write the dataset. 

685 

686 Raises 

687 ------ 

688 TypeError 

689 Supplied object and storage class are inconsistent. 

690 DatasetTypeNotSupportedError 

691 The associated `DatasetType` is not handled by this datastore. 

692 """ 

693 self._validate_put_parameters(inMemoryDataset, ref) 

694 return self._determine_put_formatter_location(ref) 

695 

696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

697 """Calculate the formatter and output location to use for put. 

698 

699 Parameters 

700 ---------- 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 """ 

711 # Work out output file name 

712 try: 

713 template = self.templates.getTemplate(ref) 

714 except KeyError as e: 

715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

716 

717 # Validate the template to protect against filenames from different 

718 # dataIds resolving to the same name and causing overwrite confusion. 

719 template.validateTemplate(ref) 

720 

721 location = self.locationFactory.fromPath(template.format(ref)) 

722 

723 # Get the formatter based on the storage class 

724 storageClass = ref.datasetType.storageClass 

725 try: 

726 formatter = self.formatterFactory.getFormatter( 

727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

728 ) 

729 except KeyError as e: 

730 raise DatasetTypeNotSupportedError( 

731 f"Unable to find formatter for {ref} in datastore {self.name}" 

732 ) from e 

733 

734 # Now that we know the formatter, update the location 

735 location = formatter.makeUpdatedLocation(location) 

736 

737 return location, formatter 

738 

739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

740 # Docstring inherited from base class 

741 if transfer != "auto": 

742 return transfer 

743 

744 # See if the paths are within the datastore or not 

745 inside = [self._pathInStore(d.path) is not None for d in datasets] 

746 

747 if all(inside): 

748 transfer = None 

749 elif not any(inside): 749 ↛ 758 (line 749 didn't jump to line 758 because the condition on line 749 was never false)

750 # Allow ResourcePath to use its own knowledge 

751 transfer = "auto" 

752 else: 

753 # This can happen when importing from a datastore that 

754 # has had some datasets ingested using "direct" mode, i.e. 

755 # files that were referenced in place rather than copied 

756 # into the datastore root. Let ResourcePath sort each file 

757 # out individually, but warn about the mixed situation. 

758 log.warning( 

759 "Some datasets are inside the datastore and some are outside. Using 'split' " 

760 "transfer mode. This assumes that the files outside the datastore are " 

761 "still accessible to the new butler since they will not be copied into " 

762 "the target datastore." 

763 ) 

764 transfer = "split" 

765 

766 return transfer 

767 
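# Illustrative sketch (not part of the source): the "auto" resolution performed
# by _overrideTransferMode() above, rewritten as a standalone helper. The
# decision depends only on whether each dataset path is already inside the root.
from typing import List, Optional

def resolve_auto_transfer(inside: List[bool]) -> Optional[str]:
    if all(inside):
        return None      # everything already in the datastore: no transfer needed
    if not any(inside):
        return "auto"    # everything external: let ResourcePath pick a mode
    return "split"       # mixed: internal files stay put, external files are referenced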

768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

769 """Return the path relative to the datastore root. 

770 

771 Parameters 

772 ---------- 

773 path : `lsst.resources.ResourcePathExpression` 

774 Path to the dataset. Can be an absolute URI. If relative, it is 

775 assumed to be relative to the datastore root. The path is returned 

776 relative to that root, or `None` if it lies outside it. 

777 

778 Returns 

779 ------- 

780 inStore : `str` 

781 Path relative to datastore root. Returns `None` if the file is 

782 outside the root. 

783 """ 

784 # Relative path will always be relative to datastore 

785 pathUri = ResourcePath(path, forceAbsolute=False) 

786 return pathUri.relative_to(self.root) 

787 
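# Illustrative sketch (not part of the source): the relative_to() behaviour that
# _pathInStore() relies on. The URIs below are made up; relative_to() gives the
# path relative to the root, or None when the file lies outside it.
from lsst.resources import ResourcePath

root = ResourcePath("file:///repo/datastore/", forceDirectory=True)
print(ResourcePath("file:///repo/datastore/raw/a.fits").relative_to(root))  # "raw/a.fits"
print(ResourcePath("file:///elsewhere/b.fits").relative_to(root))           # None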

788 def _standardizeIngestPath( 

789 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

790 ) -> Union[str, ResourcePath]: 

791 """Standardize the path of a to-be-ingested file. 

792 

793 Parameters 

794 ---------- 

795 path : `str` or `lsst.resources.ResourcePath` 

796 Path of a file to be ingested. This parameter is not expected 

797 to be all the types that can be used to construct a 

798 `~lsst.resources.ResourcePath`. 

799 transfer : `str`, optional 

800 How (and whether) the dataset should be added to the datastore. 

801 See `ingest` for details of transfer modes. 

802 This implementation is provided only so 

803 `NotImplementedError` can be raised if the mode is not supported; 

804 actual transfers are deferred to `_extractIngestInfo`. 

805 

806 Returns 

807 ------- 

808 path : `str` or `lsst.resources.ResourcePath` 

809 New path in what the datastore considers standard form. If an 

810 absolute URI was given that will be returned unchanged. 

811 

812 Notes 

813 ----- 

814 Subclasses of `FileDatastore` can implement this method instead 

815 of `_prepIngest`. It should not modify the data repository or given 

816 file in any way. 

817 

818 Raises 

819 ------ 

820 NotImplementedError 

821 Raised if the datastore does not support the given transfer mode 

822 (including the case where ingest is not supported at all). 

823 FileNotFoundError 

824 Raised if one of the given files does not exist. 

825 """ 

826 if transfer not in (None, "direct", "split") + self.root.transferModes: 826 ↛ 827 (line 826 didn't jump to line 827 because the condition on line 826 was never true)

827 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

828 

829 # A relative URI indicates relative to datastore root 

830 srcUri = ResourcePath(path, forceAbsolute=False) 

831 if not srcUri.isabs(): 

832 srcUri = self.root.join(path) 

833 

834 if not srcUri.exists(): 

835 raise FileNotFoundError( 

836 f"Resource at {srcUri} does not exist; note that paths to ingest " 

837 f"are assumed to be relative to {self.root} unless they are absolute." 

838 ) 

839 

840 if transfer is None: 

841 relpath = srcUri.relative_to(self.root) 

842 if not relpath: 

843 raise RuntimeError( 

844 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

845 ) 

846 

847 # Return the relative path within the datastore for internal 

848 # transfer 

849 path = relpath 

850 

851 return path 

852 

853 def _extractIngestInfo( 

854 self, 

855 path: ResourcePathExpression, 

856 ref: DatasetRef, 

857 *, 

858 formatter: Union[Formatter, Type[Formatter]], 

859 transfer: Optional[str] = None, 

860 record_validation_info: bool = True, 

861 ) -> StoredFileInfo: 

862 """Relocate (if necessary) and extract `StoredFileInfo` from a 

863 to-be-ingested file. 

864 

865 Parameters 

866 ---------- 

867 path : `lsst.resources.ResourcePathExpression` 

868 URI or path of a file to be ingested. 

869 ref : `DatasetRef` 

870 Reference for the dataset being ingested. Guaranteed to have 

871 ``dataset_id is not None``. 

872 formatter : `type` or `Formatter` 

873 `Formatter` subclass to use for this dataset or an instance. 

874 transfer : `str`, optional 

875 How (and whether) the dataset should be added to the datastore. 

876 See `ingest` for details of transfer modes. 

877 record_validation_info : `bool`, optional 

878 If `True`, the default, the datastore can record validation 

879 information associated with the file. If `False` the datastore 

880 will not attempt to track any information such as checksums 

881 or file sizes. This can be useful if such information is tracked 

882 in an external system or if the file is to be compressed in place. 

883 It is up to the datastore whether this parameter is relevant. 

884 

885 Returns 

886 ------- 

887 info : `StoredFileInfo` 

888 Internal datastore record for this file. This will be inserted by 

889 the caller; the `_extractIngestInfo` is only responsible for 

890 creating and populating the struct. 

891 

892 Raises 

893 ------ 

894 FileNotFoundError 

895 Raised if one of the given files does not exist. 

896 FileExistsError 

897 Raised if transfer is not `None` but the (internal) location the 

898 file would be moved to is already occupied. 

899 """ 

900 if self._transaction is None: 900 ↛ 901 (line 900 didn't jump to line 901 because the condition on line 900 was never true)

901 raise RuntimeError("Ingest called without transaction enabled") 

902 

903 # Create URI of the source path, do not need to force a relative 

904 # path to absolute. 

905 srcUri = ResourcePath(path, forceAbsolute=False) 

906 

907 # Track whether we have read the size of the source yet 

908 have_sized = False 

909 

910 tgtLocation: Optional[Location] 

911 if transfer is None or transfer == "split": 

912 # A relative path is assumed to be relative to the datastore 

913 # in this context 

914 if not srcUri.isabs(): 

915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

916 else: 

917 # Work out the path in the datastore from an absolute URI 

918 # This is required to be within the datastore. 

919 pathInStore = srcUri.relative_to(self.root) 

920 if pathInStore is None and transfer is None: 920 ↛ 921 (line 920 didn't jump to line 921 because the condition on line 920 was never true)

921 raise RuntimeError( 

922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

923 ) 

924 if pathInStore: 924 ↛ 926 (line 924 didn't jump to line 926 because the condition on line 924 was never false)

925 tgtLocation = self.locationFactory.fromPath(pathInStore) 

926 elif transfer == "split": 

927 # Outside the datastore but treat that as a direct ingest 

928 # instead. 

929 tgtLocation = None 

930 else: 

931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

932 elif transfer == "direct": 932 ↛ 937 (line 932 didn't jump to line 937 because the condition on line 932 was never true)

933 # Want to store the full URI to the resource directly in 

934 # datastore. This is useful for referring to permanent archive 

935 # storage for raw data. 

936 # Trust that people know what they are doing. 

937 tgtLocation = None 

938 else: 

939 # Work out the name we want this ingested file to have 

940 # inside the datastore 

941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

942 if not tgtLocation.uri.dirname().exists(): 

943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

944 tgtLocation.uri.dirname().mkdir() 

945 

946 # if we are transferring from a local file to a remote location 

947 # it may be more efficient to get the size and checksum of the 

948 # local file rather than the transferred one 

949 if record_validation_info and srcUri.isLocal: 

950 size = srcUri.size() 

951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

952 have_sized = True 

953 

954 # Transfer the resource to the destination. 

955 # Allow overwrite of an existing file. This matches the behavior 

956 # of datastore.put() in that it trusts that registry would not 

957 # be asking to overwrite unless registry thought that the 

958 # overwrite was allowed. 

959 tgtLocation.uri.transfer_from( 

960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

961 ) 

962 

963 if tgtLocation is None: 963 ↛ 965 (line 963 didn't jump to line 965 because the condition on line 963 was never true)

964 # This means we are using direct mode 

965 targetUri = srcUri 

966 targetPath = str(srcUri) 

967 else: 

968 targetUri = tgtLocation.uri 

969 targetPath = tgtLocation.pathInStore.path 

970 

971 # the file should exist in the datastore now 

972 if record_validation_info: 

973 if not have_sized: 

974 size = targetUri.size() 

975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

976 else: 

977 # Not recording any file information. 

978 size = -1 

979 checksum = None 

980 

981 return StoredFileInfo( 

982 formatter=formatter, 

983 path=targetPath, 

984 storageClass=ref.datasetType.storageClass, 

985 component=ref.datasetType.component(), 

986 file_size=size, 

987 checksum=checksum, 

988 ) 

989 
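# Illustrative sketch (not part of the source): computeChecksum() is referenced
# above but not shown in this excerpt. Given the module-level hashlib import, a
# chunked file hash along these lines is assumed; the helper name, algorithm and
# block size are placeholders.
import hashlib

def sketch_compute_checksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    hasher = hashlib.new(algorithm)
    with open(filename, "rb") as fd:
        for chunk in iter(lambda: fd.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()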

990 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

991 # Docstring inherited from Datastore._prepIngest. 

992 filtered = [] 

993 for dataset in datasets: 

994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

995 if not acceptable: 

996 continue 

997 else: 

998 dataset.refs = acceptable 

999 if dataset.formatter is None: 

1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1001 else: 

1002 assert isinstance(dataset.formatter, (type, str)) 

1003 formatter_class = get_class_of(dataset.formatter) 

1004 if not issubclass(formatter_class, Formatter): 1004 ↛ 1005 (line 1004 didn't jump to line 1005 because the condition on line 1004 was never true)

1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1006 dataset.formatter = formatter_class 

1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1008 filtered.append(dataset) 

1009 return _IngestPrepData(filtered) 

1010 

1011 @transactional 

1012 def _finishIngest( 

1013 self, 

1014 prepData: Datastore.IngestPrepData, 

1015 *, 

1016 transfer: Optional[str] = None, 

1017 record_validation_info: bool = True, 

1018 ) -> None: 

1019 # Docstring inherited from Datastore._finishIngest. 

1020 refsAndInfos = [] 

1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1023 # Do ingest as if the first dataset ref is associated with the file 

1024 info = self._extractIngestInfo( 

1025 dataset.path, 

1026 dataset.refs[0], 

1027 formatter=dataset.formatter, 

1028 transfer=transfer, 

1029 record_validation_info=record_validation_info, 

1030 ) 

1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1032 self._register_datasets(refsAndInfos) 

1033 

1034 def _calculate_ingested_datastore_name( 

1035 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1036 ) -> Location: 

1037 """Given a source URI and a DatasetRef, determine the name the 

1038 dataset will have inside datastore. 

1039 

1040 Parameters 

1041 ---------- 

1042 srcUri : `lsst.resources.ResourcePath` 

1043 URI to the source dataset file. 

1044 ref : `DatasetRef` 

1045 Ref associated with the newly-ingested dataset artifact. This 

1046 is used to determine the name within the datastore. 

1047 formatter : `Formatter` instance or `Formatter` class. 

1048 Formatter to use for validation. Can be a class or an instance. 

1049 

1050 Returns 

1051 ------- 

1052 location : `Location` 

1053 Target location for the newly-ingested dataset. 

1054 """ 

1055 # Ingesting a file from outside the datastore. 

1056 # This involves a new name. 

1057 template = self.templates.getTemplate(ref) 

1058 location = self.locationFactory.fromPath(template.format(ref)) 

1059 

1060 # Get the extension 

1061 ext = srcUri.getExtension() 

1062 

1063 # Update the destination to include that extension 

1064 location.updateExtension(ext) 

1065 

1066 # Ask the formatter to validate this extension 

1067 formatter.validateExtension(location) 

1068 

1069 return location 

1070 

1071 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1072 """Write an in-memory dataset out to the datastore. 

1073 

1074 Parameters 

1075 ---------- 

1076 inMemoryDataset : `object` 

1077 Dataset to write to datastore. 

1078 ref : `DatasetRef` 

1079 Registry information associated with this dataset. 

1080 

1081 Returns 

1082 ------- 

1083 info : `StoredFileInfo` 

1084 Information describing the artifact written to the datastore. 

1085 """ 

1086 # May need to coerce the in memory dataset to the correct 

1087 # python type. 

1088 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1089 

1090 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1091 uri = location.uri 

1092 

1093 if not uri.dirname().exists(): 

1094 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1095 uri.dirname().mkdir() 

1096 

1097 if self._transaction is None: 1097 ↛ 1098 (line 1097 didn't jump to line 1098 because the condition on line 1097 was never true)

1098 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1099 

1100 def _removeFileExists(uri: ResourcePath) -> None: 

1101 """Remove a file and do not complain if it is not there. 

1102 

1103 This is important since a formatter might fail before the file 

1104 is written and we should not confuse people by writing spurious 

1105 error messages to the log. 

1106 """ 

1107 try: 

1108 uri.remove() 

1109 except FileNotFoundError: 

1110 pass 

1111 

1112 # Register a callback to try to delete the uploaded data if 

1113 # something fails below 

1114 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1115 

1116 # For a local file, simply use the formatter directly 

1117 if uri.isLocal: 

1118 try: 

1119 formatter.write(inMemoryDataset) 

1120 except Exception as e: 

1121 raise RuntimeError( 

1122 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1123 ) from e 

1124 log.debug("Successfully wrote python object to local file at %s", uri) 

1125 else: 

1126 # This is a remote URI. Some datasets can be serialized directly 

1127 # to bytes and sent to the remote datastore without writing a 

1128 # file. If the dataset is intended to be saved to the cache 

1129 # a file is always written and direct write to the remote 

1130 # datastore is bypassed. 

1131 data_written = False 

1132 if not self.cacheManager.should_be_cached(ref): 

1133 try: 

1134 serializedDataset = formatter.toBytes(inMemoryDataset) 

1135 except NotImplementedError: 

1136 # Fallback to the file writing option. 

1137 pass 

1138 except Exception as e: 

1139 raise RuntimeError( 

1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1141 ) from e 

1142 else: 

1143 log.debug("Writing bytes directly to %s", uri) 

1144 uri.write(serializedDataset, overwrite=True) 

1145 log.debug("Successfully wrote bytes directly to %s", uri) 

1146 data_written = True 

1147 

1148 if not data_written: 

1149 # Did not write the bytes directly to object store so instead 

1150 # write to temporary file. 

1151 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1152 # Need to configure the formatter to write to a different 

1153 # location and that needs us to overwrite internals 

1154 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1155 with formatter._updateLocation(Location(None, temporary_uri)): 

1156 try: 

1157 formatter.write(inMemoryDataset) 

1158 except Exception as e: 

1159 raise RuntimeError( 

1160 f"Failed to serialize dataset {ref} of type" 

1161 f" {type(inMemoryDataset)} to " 

1162 f"temporary location {temporary_uri}" 

1163 ) from e 

1164 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1165 

1166 # Cache if required 

1167 self.cacheManager.move_to_cache(temporary_uri, ref) 

1168 

1169 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1170 

1171 # The URI is needed to resolve which ingest case we are dealing with. 

1172 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1173 
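# Illustrative sketch (not part of the source): the remote-write fallback used in
# _write_in_memory_to_artifact() above, reduced to its essentials. A local
# temporary file is written first and then copied to the remote URI; the helper
# name and the bytes stand-in for formatter.write() are assumptions.
from lsst.resources import ResourcePath

def sketch_write_via_temporary(data: bytes, destination: ResourcePath) -> None:
    with ResourcePath.temporary_uri(suffix=destination.getExtension()) as tmp:
        tmp.write(data)  # stand-in for formatter.write() to the temporary location
        destination.transfer_from(tmp, transfer="copy", overwrite=True)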

1174 def _read_artifact_into_memory( 

1175 self, 

1176 getInfo: DatastoreFileGetInformation, 

1177 ref: DatasetRef, 

1178 isComponent: bool = False, 

1179 cache_ref: Optional[DatasetRef] = None, 

1180 ) -> Any: 

1181 """Read the artifact from the datastore into an in-memory object. 

1182 

1183 Parameters 

1184 ---------- 

1185 getInfo : `DatastoreFileGetInformation` 

1186 Information about the artifact within the datastore. 

1187 ref : `DatasetRef` 

1188 The registry information associated with this artifact. 

1189 isComponent : `bool` 

1190 Flag to indicate if a component is being read from this artifact. 

1191 cache_ref : `DatasetRef`, optional 

1192 The DatasetRef to use when looking up the file in the cache. 

1193 This ref must have the same ID as the supplied ref but can 

1194 be a parent ref or component ref to indicate to the cache whether 

1195 a composite file is being requested from the cache or a component 

1196 file. Without this the cache will default to the supplied ref but 

1197 it can get confused with read-only derived components for 

1198 disassembled composites. 

1199 

1200 Returns 

1201 ------- 

1202 inMemoryDataset : `object` 

1203 The artifact as a python object. 

1204 """ 

1205 location = getInfo.location 

1206 uri = location.uri 

1207 log.debug("Accessing data from %s", uri) 

1208 

1209 if cache_ref is None: 

1210 cache_ref = ref 

1211 if cache_ref.id != ref.id: 1211 ↛ 1212 (line 1211 didn't jump to line 1212 because the condition on line 1211 was never true)

1212 raise ValueError( 

1213 "The supplied cache dataset ref refers to a different dataset than expected:" 

1214 f" {ref.id} != {cache_ref.id}" 

1215 ) 

1216 

1217 # Cannot recalculate checksum but can compare size as a quick check 

1218 # Do not do this if the size is negative since that indicates 

1219 # we do not know. 

1220 recorded_size = getInfo.info.file_size 

1221 resource_size = uri.size() 

1222 if recorded_size >= 0 and resource_size != recorded_size: 1222 ↛ 1223 (line 1222 didn't jump to line 1223 because the condition on line 1222 was never true)

1223 raise RuntimeError( 

1224 "Integrity failure in Datastore. " 

1225 f"Size of file {uri} ({resource_size}) " 

1226 f"does not match size recorded in registry of {recorded_size}" 

1227 ) 

1228 

1229 # For the general case we have choices for how to proceed. 

1230 # 1. Always use a local file (downloading the remote resource to a 

1231 # temporary file if needed). 

1232 # 2. Use a threshold size and read into memory and use bytes. 

1233 # Use both for now with an arbitrary hand off size. 

1234 # This allows small datasets to be downloaded from remote object 

1235 # stores without requiring a temporary file. 

1236 

1237 formatter = getInfo.formatter 

1238 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1239 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1240 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1241 if cached_file is not None: 

1242 desired_uri = cached_file 

1243 msg = f" (cached version of {uri})" 

1244 else: 

1245 desired_uri = uri 

1246 msg = "" 

1247 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1248 serializedDataset = desired_uri.read() 

1249 log.debug( 

1250 "Deserializing %s from %d bytes from location %s with formatter %s", 

1251 f"component {getInfo.component}" if isComponent else "", 

1252 len(serializedDataset), 

1253 uri, 

1254 formatter.name(), 

1255 ) 

1256 try: 

1257 result = formatter.fromBytes( 

1258 serializedDataset, component=getInfo.component if isComponent else None 

1259 ) 

1260 except Exception as e: 

1261 raise ValueError( 

1262 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1263 f" ({ref.datasetType.name} from {uri}): {e}" 

1264 ) from e 

1265 else: 

1266 # Read from file. 

1267 

1268 # Have to update the Location associated with the formatter 

1269 # because formatter.read does not allow an override. 

1270 # This could be improved. 

1271 location_updated = False 

1272 msg = "" 

1273 

1274 # First check in cache for local version. 

1275 # The cache will only be relevant for remote resources but 

1276 # no harm in always asking. Context manager ensures that cache 

1277 # file is not deleted during cache expiration. 

1278 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1279 if cached_file is not None: 

1280 msg = f"(via cache read of remote file {uri})" 

1281 uri = cached_file 

1282 location_updated = True 

1283 

1284 with uri.as_local() as local_uri: 

1285 

1286 can_be_cached = False 

1287 if uri != local_uri: 1287 ↛ 1289 (line 1287 didn't jump to line 1289 because the condition on line 1287 was never true)

1288 # URI was remote and file was downloaded 

1289 cache_msg = "" 

1290 location_updated = True 

1291 

1292 if self.cacheManager.should_be_cached(cache_ref): 

1293 # In this scenario we want to ask if the downloaded 

1294 # file should be cached but we should not cache 

1295 # it until after we've used it (to ensure it can't 

1296 # be expired whilst we are using it). 

1297 can_be_cached = True 

1298 

1299 # Say that it is "likely" to be cached because 

1300 # if the formatter read fails we will not be 

1301 # caching this file. 

1302 cache_msg = " and likely cached" 

1303 

1304 msg = f"(via download to local file{cache_msg})" 

1305 

1306 # Calculate the (possibly) new location for the formatter 

1307 # to use. 

1308 newLocation = Location(*local_uri.split()) if location_updated else None 

1309 

1310 log.debug( 

1311 "Reading%s from location %s %s with formatter %s", 

1312 f" component {getInfo.component}" if isComponent else "", 

1313 uri, 

1314 msg, 

1315 formatter.name(), 

1316 ) 

1317 try: 

1318 with formatter._updateLocation(newLocation): 

1319 with time_this( 

1320 log, 

1321 msg="Reading%s from location %s %s with formatter %s", 

1322 args=( 

1323 f" component {getInfo.component}" if isComponent else "", 

1324 uri, 

1325 msg, 

1326 formatter.name(), 

1327 ), 

1328 ): 

1329 result = formatter.read(component=getInfo.component if isComponent else None) 

1330 except Exception as e: 

1331 raise ValueError( 

1332 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1333 f" ({ref.datasetType.name} from {uri}): {e}" 

1334 ) from e 

1335 

1336 # File was read successfully so can move to cache 

1337 if can_be_cached: 1337 ↛ 1338 (line 1337 didn't jump to line 1338 because the condition on line 1337 was never true)

1338 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1339 

1340 return self._post_process_get( 

1341 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1342 ) 

1343 

1344 def knows(self, ref: DatasetRef) -> bool: 

1345 """Check if the dataset is known to the datastore. 

1346 

1347 Does not check for existence of any artifact. 

1348 

1349 Parameters 

1350 ---------- 

1351 ref : `DatasetRef` 

1352 Reference to the required dataset. 

1353 

1354 Returns 

1355 ------- 

1356 exists : `bool` 

1357 `True` if the dataset is known to the datastore. 

1358 """ 

1359 fileLocations = self._get_dataset_locations_info(ref) 

1360 if fileLocations: 

1361 return True 

1362 return False 

1363 

1364 def _process_mexists_records( 

1365 self, 

1366 id_to_ref: Dict[DatasetId, DatasetRef], 

1367 records: Dict[DatasetId, List[StoredFileInfo]], 

1368 all_required: bool, 

1369 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1370 ) -> Dict[DatasetRef, bool]: 

1371 """Helper function for mexists that checks the given records. 

1372 

1373 Parameters 

1374 ---------- 

1375 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1376 Mapping of the dataset ID to the dataset ref itself. 

1377 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1378 Records as generally returned by 

1379 ``_get_stored_records_associated_with_refs``. 

1380 all_required : `bool` 

1381 Flag indicating whether a dataset counts as existing only if all 

1382 artifacts associated with its ID exist, or if any one is enough. 

1383 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1384 Optional mapping of datastore artifact to existence. Updated by 

1385 this method with details of all artifacts tested. Can be `None` 

1386 if the caller is not interested. 

1387 

1388 Returns 

1389 ------- 

1390 existence : `dict` of [`DatasetRef`, `bool`] 

1391 Mapping from dataset to boolean indicating existence. 

1392 """ 

1393 # The URIs to be checked and a mapping of those URIs to 

1394 # the dataset ID. 

1395 uris_to_check: List[ResourcePath] = [] 

1396 location_map: Dict[ResourcePath, DatasetId] = {} 

1397 

1398 location_factory = self.locationFactory 

1399 

1400 for ref_id, infos in records.items(): 

1401 # Key is the dataset ID, value is a list of StoredFileInfo. 

1402 uris = [info.file_location(location_factory).uri for info in infos] 

1403 uris_to_check.extend(uris) 

1404 location_map.update({uri: ref_id for uri in uris}) 

1405 

1406 uri_existence: Dict[ResourcePath, bool] = {} 

1407 if artifact_existence is not None: 

1408 # If a URI has already been checked remove it from the list 

1409 # and immediately add the status to the output dict. 

1410 filtered_uris_to_check = [] 

1411 for uri in uris_to_check: 

1412 if uri in artifact_existence: 

1413 uri_existence[uri] = artifact_existence[uri] 

1414 else: 

1415 filtered_uris_to_check.append(uri) 

1416 uris_to_check = filtered_uris_to_check 

1417 

1418 # Results. 

1419 dataset_existence: Dict[DatasetRef, bool] = {} 

1420 

1421 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1422 for uri, exists in uri_existence.items(): 

1423 dataset_id = location_map[uri] 

1424 ref = id_to_ref[dataset_id] 

1425 

1426 # Disassembled composite needs to check all locations. 

1427 # all_required indicates whether all need to exist or not. 

1428 if ref in dataset_existence: 

1429 if all_required: 

1430 exists = dataset_existence[ref] and exists 

1431 else: 

1432 exists = dataset_existence[ref] or exists 

1433 dataset_existence[ref] = exists 

1434 

1435 if artifact_existence is not None: 

1436 artifact_existence.update(uri_existence) 

1437 

1438 return dataset_existence 

1439 
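# Illustrative sketch (not part of the source): how per-artifact existence is
# folded into per-dataset existence above. With all_required=True every artifact
# of a disassembled composite must exist; otherwise any one existing is enough.
from typing import List

def sketch_merge_existence(per_artifact: List[bool], all_required: bool) -> bool:
    return all(per_artifact) if all_required else any(per_artifact)

# sketch_merge_existence([True, False], all_required=True)  -> False
# sketch_merge_existence([True, False], all_required=False) -> True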

1440 def mexists( 

1441 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1442 ) -> Dict[DatasetRef, bool]: 

1443 """Check the existence of multiple datasets at once. 

1444 

1445 Parameters 

1446 ---------- 

1447 refs : iterable of `DatasetRef` 

1448 The datasets to be checked. 

1449 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1450 Optional mapping of datastore artifact to existence. Updated by 

1451 this method with details of all artifacts tested. Can be `None` 

1452 if the caller is not interested. 

1453 

1454 Returns 

1455 ------- 

1456 existence : `dict` of [`DatasetRef`, `bool`] 

1457 Mapping from dataset to boolean indicating existence. 

1458 """ 

1459 chunk_size = 10_000 

1460 dataset_existence: Dict[DatasetRef, bool] = {} 

1461 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1462 n_found_total = 0 

1463 n_checked = 0 

1464 n_chunks = 0 

1465 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1466 chunk_result = self._mexists(chunk, artifact_existence) 

1467 if log.isEnabledFor(VERBOSE): 

1468 n_results = len(chunk_result) 

1469 n_checked += n_results 

1470 # Can treat the booleans as 0, 1 integers and sum them. 

1471 n_found = sum(chunk_result.values()) 

1472 n_found_total += n_found 

1473 log.verbose( 

1474 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1475 n_chunks, 

1476 n_found, 

1477 n_results, 

1478 n_found_total, 

1479 n_checked, 

1480 ) 

1481 dataset_existence.update(chunk_result) 

1482 n_chunks += 1 

1483 

1484 return dataset_existence 

1485 
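# A hypothetical usage sketch, not part of FileDatastore itself: call
# mexists() with a shared artifact-existence cache so repeated checks of the
# same URIs are avoided. ``datastore`` is assumed to be a configured
# FileDatastore and ``refs`` resolved DatasetRef objects obtained elsewhere.
from typing import Dict, Iterable

from lsst.daf.butler import DatasetRef
from lsst.resources import ResourcePath


def count_existing(datastore, refs: Iterable[DatasetRef]) -> int:
    # Reusing this mapping across calls lets the datastore skip URIs it has
    # already tested.
    artifact_existence: Dict[ResourcePath, bool] = {}
    existence = datastore.mexists(refs, artifact_existence=artifact_existence)
    # Booleans sum as 0/1 integers, mirroring the VERBOSE accounting above.
    return sum(existence.values())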

1486 def _mexists( 

1487 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1488 ) -> Dict[DatasetRef, bool]: 

1489 """Check the existence of multiple datasets at once. 

1490 

1491 Parameters 

1492 ---------- 

1493 refs : iterable of `DatasetRef` 

1494 The datasets to be checked. 
artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 
Optional mapping of datastore artifact to existence. Updated by 
this method with details of all artifacts tested. Can be `None` 
if the caller is not interested. 

1495 

1496 Returns 

1497 ------- 

1498 existence : `dict` of [`DatasetRef`, `bool`] 

1499 Mapping from dataset to boolean indicating existence. 

1500 """ 

1501 # Need a mapping of dataset_id to dataset ref since the API 

1502 # works with dataset_id 

1503 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1504 

1505 # Set of all IDs we are checking for. 

1506 requested_ids = set(id_to_ref.keys()) 

1507 

1508 # The records themselves. Could be missing some entries. 

1509 records = self._get_stored_records_associated_with_refs(refs) 

1510 

1511 dataset_existence = self._process_mexists_records( 

1512 id_to_ref, records, True, artifact_existence=artifact_existence 

1513 ) 

1514 

1515 # Set of IDs that have been handled. 

1516 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1517 

1518 missing_ids = requested_ids - handled_ids 

1519 if missing_ids: 

1520 if not self.trustGetRequest: 

1521 # Must assume these do not exist 

1522 for missing in missing_ids: 

1523 dataset_existence[id_to_ref[missing]] = False 

1524 else: 

1525 log.debug( 

1526 "%d out of %d datasets were not known to datastore during initial existence check.", 

1527 len(missing_ids), 

1528 len(requested_ids), 

1529 ) 

1530 

1531 # Construct data structure identical to that returned 

1532 # by _get_stored_records_associated_with_refs() but using 

1533 # guessed names. 

1534 records = {} 

1535 for missing in missing_ids: 

1536 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1537 records[missing] = [info for _, info in expected] 

1538 

1539 dataset_existence.update( 

1540 self._process_mexists_records( 

1541 id_to_ref, records, False, artifact_existence=artifact_existence 

1542 ) 

1543 ) 

1544 

1545 return dataset_existence 

1546 

1547 def exists(self, ref: DatasetRef) -> bool: 

1548 """Check if the dataset exists in the datastore. 

1549 

1550 Parameters 

1551 ---------- 

1552 ref : `DatasetRef` 

1553 Reference to the required dataset. 

1554 

1555 Returns 

1556 ------- 

1557 exists : `bool` 

1558 `True` if the entity exists in the `Datastore`. 

1559 """ 

1560 fileLocations = self._get_dataset_locations_info(ref) 

1561 

1562 # If we are being asked to trust that the registry might not be correct, 

1563 # we ask for the expected locations and check them explicitly. 

1564 if not fileLocations: 

1565 if not self.trustGetRequest: 

1566 return False 

1567 

1568 # When we are guessing a dataset location we can not check 

1569 # for the existence of every component since we can not 

1570 # know if every component was written. Instead we check 

1571 # for the existence of any of the expected locations. 

1572 for location, _ in self._get_expected_dataset_locations_info(ref): 1572 ↛ 1575line 1572 didn't jump to line 1575, because the loop on line 1572 didn't complete

1573 if self._artifact_exists(location): 1573 ↛ 1572line 1573 didn't jump to line 1572, because the condition on line 1573 was never false

1574 return True 

1575 return False 

1576 

1577 # All listed artifacts must exist. 

1578 for location, _ in fileLocations: 

1579 if not self._artifact_exists(location): 

1580 return False 

1581 

1582 return True 

1583 
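# A hypothetical usage sketch: per-dataset existence check. When testing many
# refs, mexists() above is preferred because it batches the artifact checks.
def require_present(datastore, ref) -> None:
    if not datastore.exists(ref):
        raise FileNotFoundError(f"Dataset {ref} is not present in the datastore")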

1584 def getURIs( 

1585 self, ref: DatasetRef, predict: bool = False 

1586 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1587 """Return URIs associated with dataset. 

1588 

1589 Parameters 

1590 ---------- 

1591 ref : `DatasetRef` 

1592 Reference to the required dataset. 

1593 predict : `bool`, optional 

1594 If the datastore does not know about the dataset, should it 

1595 return a predicted URI or not? 

1596 

1597 Returns 

1598 ------- 

1599 primary : `lsst.resources.ResourcePath` 

1600 The URI to the primary artifact associated with this dataset. 

1601 If the dataset was disassembled within the datastore this 

1602 may be `None`. 

1603 components : `dict` 

1604 URIs to any components associated with the dataset artifact. 

1605 Can be empty if there are no components. 

1606 """ 

1607 

1608 primary: Optional[ResourcePath] = None 

1609 components: Dict[str, ResourcePath] = {} 

1610 

1611 # if this has never been written then we have to guess 

1612 if not self.exists(ref): 

1613 if not predict: 

1614 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1615 

1616 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1617 

1618 if doDisassembly: 

1619 

1620 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1621 compRef = ref.makeComponentRef(component) 

1622 compLocation, _ = self._determine_put_formatter_location(compRef) 

1623 

1624 # Add a URI fragment to indicate this is a guess 

1625 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted") 

1626 

1627 else: 

1628 

1629 location, _ = self._determine_put_formatter_location(ref) 

1630 

1631 # Add a URI fragment to indicate this is a guess 

1632 primary = ResourcePath(location.uri.geturl() + "#predicted") 

1633 

1634 return primary, components 

1635 

1636 # If this is a ref that we have written we can get the path. 

1637 # Get file metadata and internal metadata 

1638 fileLocations = self._get_dataset_locations_info(ref) 

1639 

1640 guessing = False 

1641 if not fileLocations: 

1642 if not self.trustGetRequest: 1642 ↛ 1643line 1642 didn't jump to line 1643, because the condition on line 1642 was never true

1643 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1644 fileLocations = self._get_expected_dataset_locations_info(ref) 

1645 guessing = True 

1646 

1647 if len(fileLocations) == 1: 

1648 # No disassembly so this is the primary URI 

1649 uri = fileLocations[0][0].uri 

1650 if guessing and not uri.exists(): 1650 ↛ 1651line 1650 didn't jump to line 1651, because the condition on line 1650 was never true

1651 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1652 primary = uri 

1653 

1654 else: 

1655 for location, storedFileInfo in fileLocations: 

1656 if storedFileInfo.component is None: 1656 ↛ 1657line 1656 didn't jump to line 1657, because the condition on line 1656 was never true

1657 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1658 uri = location.uri 

1659 if guessing and not uri.exists(): 1659 ↛ 1663line 1659 didn't jump to line 1663, because the condition on line 1659 was never true

1660 # If we are trusting then it is entirely possible for 

1661 # some components to be missing. In that case we skip 

1662 # to the next component. 

1663 if self.trustGetRequest: 

1664 continue 

1665 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1666 components[storedFileInfo.component] = uri 

1667 

1668 return primary, components 

1669 
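# A hypothetical usage sketch: predicted URIs carry a "#predicted" fragment,
# and a disassembled composite returns component URIs with no primary URI.
def describe_uris(datastore, ref) -> None:
    primary, components = datastore.getURIs(ref, predict=True)
    if primary is not None:
        print(f"primary artifact: {primary}")
    for name, uri in components.items():
        print(f"component {name}: {uri}")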

1670 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1671 """URI to the Dataset. 

1672 

1673 Parameters 

1674 ---------- 

1675 ref : `DatasetRef` 

1676 Reference to the required Dataset. 

1677 predict : `bool` 

1678 If `True`, allow URIs to be returned of datasets that have not 

1679 been written. 

1680 

1681 Returns 

1682 ------- 

1683 uri : `lsst.resources.ResourcePath` 

1684 URI pointing to the dataset within the datastore. If the 

1685 dataset does not exist in the datastore, and if ``predict`` is 

1686 `True`, the URI will be a prediction and will include a URI 

1687 fragment "#predicted". 

1688 If the datastore does not have entities that relate well 

1689 to the concept of a URI the returned URI will be 

1690 descriptive. The returned URI is not guaranteed to be obtainable. 

1691 

1692 Raises 

1693 ------ 

1694 FileNotFoundError 

1695 Raised if a URI has been requested for a dataset that does not 

1696 exist and guessing is not allowed. 

1697 RuntimeError 

1698 Raised if a request is made for a single URI but multiple URIs 

1699 are associated with this dataset. 

1700 

1701 Notes 

1702 ----- 

1703 When a predicted URI is requested an attempt will be made to form 

1704 a reasonable URI based on file templates and the expected formatter. 

1705 """ 

1706 primary, components = self.getURIs(ref, predict) 

1707 if primary is None or components: 1707 ↛ 1708line 1707 didn't jump to line 1708, because the condition on line 1707 was never true

1708 raise RuntimeError( 

1709 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1710 ) 

1711 return primary 

1712 
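# A hypothetical usage sketch: getURI() is only valid for single-artifact
# datasets; fall back to getURIs() when the dataset was disassembled.
def uri_or_components(datastore, ref):
    try:
        return datastore.getURI(ref)
    except RuntimeError:
        # Multiple URIs are associated with this dataset.
        _, components = datastore.getURIs(ref)
        return components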

1713 def retrieveArtifacts( 

1714 self, 

1715 refs: Iterable[DatasetRef], 

1716 destination: ResourcePath, 

1717 transfer: str = "auto", 

1718 preserve_path: bool = True, 

1719 overwrite: bool = False, 

1720 ) -> List[ResourcePath]: 

1721 """Retrieve the file artifacts associated with the supplied refs. 

1722 

1723 Parameters 

1724 ---------- 

1725 refs : iterable of `DatasetRef` 

1726 The datasets for which file artifacts are to be retrieved. 

1727 A single ref can result in multiple files. The refs must 

1728 be resolved. 

1729 destination : `lsst.resources.ResourcePath` 

1730 Location to write the file artifacts. 

1731 transfer : `str`, optional 

1732 Method to use to transfer the artifacts. Must be one of the options 

1733 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1734 "move" is not allowed. 

1735 preserve_path : `bool`, optional 

1736 If `True` the full path of the file artifact within the datastore 

1737 is preserved. If `False` the final file component of the path 

1738 is used. 

1739 overwrite : `bool`, optional 

1740 If `True` allow transfers to overwrite existing files at the 

1741 destination. 

1742 

1743 Returns 

1744 ------- 

1745 targets : `list` of `lsst.resources.ResourcePath` 

1746 URIs of file artifacts in destination location. Order is not 

1747 preserved. 

1748 """ 

1749 if not destination.isdir(): 1749 ↛ 1750line 1749 didn't jump to line 1750, because the condition on line 1749 was never true

1750 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1751 

1752 if transfer == "move": 

1753 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1754 

1755 # Source -> Destination 

1756 # This also helps filter out duplicate DatasetRef in the request 

1757 # that will map to the same underlying file transfer. 

1758 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1759 

1760 for ref in refs: 

1761 locations = self._get_dataset_locations_info(ref) 

1762 for location, _ in locations: 

1763 source_uri = location.uri 

1764 target_path: ResourcePathExpression 

1765 if preserve_path: 

1766 target_path = location.pathInStore 

1767 if target_path.isabs(): 1767 ↛ 1770line 1767 didn't jump to line 1770, because the condition on line 1767 was never true

1768 # This is an absolute path to an external file. 

1769 # Use the full path. 

1770 target_path = target_path.relativeToPathRoot 

1771 else: 

1772 target_path = source_uri.basename() 

1773 target_uri = destination.join(target_path) 

1774 to_transfer[source_uri] = target_uri 

1775 

1776 # In theory can now parallelize the transfer 

1777 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1778 for source_uri, target_uri in to_transfer.items(): 

1779 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1780 

1781 return list(to_transfer.values()) 

1782 
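# A hypothetical usage sketch: copy the file artifacts for some refs into a
# local directory, preserving their in-store paths. The target directory is
# assumed to exist already; "move" would be rejected.
from lsst.resources import ResourcePath


def pull_artifacts(datastore, refs, directory: str = "/tmp/retrieved/"):
    destination = ResourcePath(directory, forceDirectory=True)
    return datastore.retrieveArtifacts(
        refs, destination, transfer="copy", preserve_path=True, overwrite=False
    )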

1783 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1784 """Load an InMemoryDataset from the store. 

1785 

1786 Parameters 

1787 ---------- 

1788 ref : `DatasetRef` 

1789 Reference to the required Dataset. 

1790 parameters : `dict` 

1791 `StorageClass`-specific parameters that specify, for example, 

1792 a slice of the dataset to be loaded. 

1793 

1794 Returns 

1795 ------- 

1796 inMemoryDataset : `object` 

1797 Requested dataset or slice thereof as an InMemoryDataset. 

1798 

1799 Raises 

1800 ------ 

1801 FileNotFoundError 

1802 Requested dataset can not be retrieved. 

1803 TypeError 

1804 Return value from formatter has unexpected type. 

1805 ValueError 

1806 Formatter failed to process the dataset. 

1807 """ 

1808 allGetInfo = self._prepare_for_get(ref, parameters) 

1809 refComponent = ref.datasetType.component() 

1810 

1811 # Supplied storage class for the component being read 

1812 refStorageClass = ref.datasetType.storageClass 

1813 

1814 # Create mapping from component name to related info 

1815 allComponents = {i.component: i for i in allGetInfo} 

1816 

1817 # By definition the dataset is disassembled if we have more 

1818 # than one record for it. 

1819 isDisassembled = len(allGetInfo) > 1 

1820 

1821 # Look for the special case where we are disassembled but the 

1822 # component is a derived component that was not written during 

1823 # disassembly. For this scenario we need to check that the 

1824 # component requested is listed as a derived component for the 

1825 # composite storage class 

1826 isDisassembledReadOnlyComponent = False 

1827 if isDisassembled and refComponent: 

1828 # The composite storage class should be accessible through 

1829 # the component dataset type 

1830 compositeStorageClass = ref.datasetType.parentStorageClass 

1831 

1832 # In the unlikely scenario where the composite storage 

1833 # class is not known, we can only assume that this is a 

1834 # normal component. If that assumption is wrong then the 

1835 # branch below that reads a persisted component will fail 

1836 # so there is no need to complain here. 

1837 if compositeStorageClass is not None: 1837 ↛ 1840line 1837 didn't jump to line 1840, because the condition on line 1837 was never false

1838 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1839 

1840 if isDisassembled and not refComponent: 

1841 # This was a disassembled dataset spread over multiple files 

1842 # and we need to put them all back together again. 

1843 # Read into memory and then assemble 

1844 

1845 # Check that the supplied parameters are suitable for the type read 

1846 refStorageClass.validateParameters(parameters) 

1847 

1848 # We want to keep track of all the parameters that were not used 

1849 # by formatters. We assume that if any of the component formatters 

1850 # use a parameter that we do not need to apply it again in the 

1851 # assembler. 

1852 usedParams = set() 

1853 

1854 components: Dict[str, Any] = {} 

1855 for getInfo in allGetInfo: 

1856 # assemblerParams are parameters not understood by the 

1857 # associated formatter. 

1858 usedParams.update(set(getInfo.formatterParams)) 

1859 

1860 component = getInfo.component 

1861 

1862 if component is None: 1862 ↛ 1863line 1862 didn't jump to line 1863, because the condition on line 1862 was never true

1863 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1864 

1865 # We do not want the formatter to think it's reading 

1866 # a component though because it is really reading a 

1867 # standalone dataset -- always tell reader it is not a 

1868 # component. 

1869 components[component] = self._read_artifact_into_memory( 

1870 getInfo, ref.makeComponentRef(component), isComponent=False 

1871 ) 

1872 

1873 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1874 

1875 # Any unused parameters will have to be passed to the assembler 

1876 if parameters: 

1877 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1878 else: 

1879 unusedParams = {} 

1880 

1881 # Process parameters 

1882 return ref.datasetType.storageClass.delegate().handleParameters( 

1883 inMemoryDataset, parameters=unusedParams 

1884 ) 

1885 

1886 elif isDisassembledReadOnlyComponent: 

1887 

1888 compositeStorageClass = ref.datasetType.parentStorageClass 

1889 if compositeStorageClass is None: 1889 ↛ 1890line 1889 didn't jump to line 1890, because the condition on line 1889 was never true

1890 raise RuntimeError( 

1891 f"Unable to retrieve derived component '{refComponent}' since" 

1892 "no composite storage class is available." 

1893 ) 

1894 

1895 if refComponent is None: 1895 ↛ 1897line 1895 didn't jump to line 1897, because the condition on line 1895 was never true

1896 # Mainly for mypy 

1897 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1898 

1899 # Assume that every derived component can be calculated by 

1900 # forwarding the request to a single read/write component. 

1901 # Rather than guessing which rw component is the right one by 

1902 # scanning each for a derived component of the same name, 

1903 # we ask the storage class delegate directly which one is best to 

1904 # use. 

1905 compositeDelegate = compositeStorageClass.delegate() 

1906 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1907 refComponent, set(allComponents) 

1908 ) 

1909 

1910 # Select the relevant component 

1911 rwInfo = allComponents[forwardedComponent] 

1912 

1913 # For now assume that read parameters are validated against 

1914 # the real component and not the requested component 

1915 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1916 forwardedStorageClass.validateParameters(parameters) 

1917 

1918 # The reference to use for the caching must refer to the forwarded 

1919 # component and not the derived component. 

1920 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1921 

1922 # Unfortunately the FileDescriptor inside the formatter will have 

1923 # the wrong write storage class so we need to create a new one 

1924 # given the immutability constraint. 

1925 writeStorageClass = rwInfo.info.storageClass 

1926 

1927 # We may need to put some thought into parameters for read 

1928 # components but for now forward them on as is 

1929 readFormatter = type(rwInfo.formatter)( 

1930 FileDescriptor( 

1931 rwInfo.location, 

1932 readStorageClass=refStorageClass, 

1933 storageClass=writeStorageClass, 

1934 parameters=parameters, 

1935 ), 

1936 ref.dataId, 

1937 ) 

1938 

1939 # The assembler can not receive any parameter requests for a 

1940 # derived component at this time since the assembler will 

1941 # see the storage class of the derived component and those 

1942 # parameters will have to be handled by the formatter on the 

1943 # forwarded storage class. 

1944 assemblerParams: Dict[str, Any] = {} 

1945 

1946 # Need to create a new info that specifies the derived 

1947 # component and associated storage class 

1948 readInfo = DatastoreFileGetInformation( 

1949 rwInfo.location, 

1950 readFormatter, 

1951 rwInfo.info, 

1952 assemblerParams, 

1953 {}, 

1954 refComponent, 

1955 refStorageClass, 

1956 ) 

1957 

1958 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

1959 

1960 else: 

1961 # Single file request or component from that composite file 

1962 for lookup in (refComponent, None): 1962 ↛ 1967line 1962 didn't jump to line 1967, because the loop on line 1962 didn't complete

1963 if lookup in allComponents: 1963 ↛ 1962line 1963 didn't jump to line 1962, because the condition on line 1963 was never false

1964 getInfo = allComponents[lookup] 

1965 break 

1966 else: 

1967 raise FileNotFoundError( 

1968 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

1969 ) 

1970 

1971 # Do not need the component itself if already disassembled 

1972 if isDisassembled: 

1973 isComponent = False 

1974 else: 

1975 isComponent = getInfo.component is not None 

1976 

1977 # For a component read of a composite we want the cache to 

1978 # be looking at the composite ref itself. 

1979 cache_ref = ref.makeCompositeRef() if isComponent else ref 

1980 

1981 # For a disassembled component we can validate parameters against 

1982 # the component storage class directly 

1983 if isDisassembled: 

1984 refStorageClass.validateParameters(parameters) 

1985 else: 

1986 # For an assembled composite this could be a derived 

1987 # component derived from a real component. The validity 

1988 # of the parameters is not clear. For now validate against 

1989 # the composite storage class 

1990 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1991 

1992 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

1993 
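# A hypothetical usage sketch: read a dataset back into memory, optionally
# passing StorageClass-specific read parameters (the valid parameter names
# depend on the storage class of the ref).
from typing import Any, Mapping, Optional


def read_or_none(datastore, ref, parameters: Optional[Mapping[str, Any]] = None):
    try:
        return datastore.get(ref, parameters=parameters)
    except FileNotFoundError:
        # The dataset (or requested component) is not retrievable.
        return None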

1994 @transactional 

1995 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1996 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1997 

1998 Parameters 

1999 ---------- 

2000 inMemoryDataset : `object` 

2001 The dataset to store. 

2002 ref : `DatasetRef` 

2003 Reference to the associated Dataset. 

2004 

2005 Raises 

2006 ------ 

2007 TypeError 

2008 Supplied object and storage class are inconsistent. 

2009 DatasetTypeNotSupportedError 

2010 The associated `DatasetType` is not handled by this datastore. 

2011 

2012 Notes 

2013 ----- 

2014 If the datastore is configured to reject certain dataset types it 

2015 is possible that the put will fail and raise a 

2016 `DatasetTypeNotSupportedError`. The main use case for this is to 

2017 allow `ChainedDatastore` to put to multiple datastores without 

2018 requiring that every datastore accepts the dataset. 

2019 """ 

2020 

2021 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2022 # doDisassembly = True 

2023 

2024 artifacts = [] 

2025 if doDisassembly: 

2026 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2027 for component, componentInfo in components.items(): 

2028 # Don't recurse because we want to take advantage of 

2029 # bulk insert -- need a new DatasetRef that refers to the 

2030 # same dataset_id but has the component DatasetType 

2031 # DatasetType does not refer to the types of components 

2032 # So we construct one ourselves. 

2033 compRef = ref.makeComponentRef(component) 

2034 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2035 artifacts.append((compRef, storedInfo)) 

2036 else: 

2037 # Write the entire thing out 

2038 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2039 artifacts.append((ref, storedInfo)) 

2040 

2041 self._register_datasets(artifacts) 

2042 
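# A hypothetical usage sketch: put() either writes a single artifact or, when
# the composites configuration says to disassemble, one artifact per
# component, before registering all records in one bulk insert.
from lsst.daf.butler import DatasetTypeNotSupportedError


def store_if_supported(datastore, obj, ref) -> bool:
    try:
        datastore.put(obj, ref)
    except DatasetTypeNotSupportedError:
        # Expected when this datastore's constraints reject the dataset type,
        # for example when used inside a ChainedDatastore.
        return False
    return True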

2043 @transactional 

2044 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2045 # At this point can safely remove these datasets from the cache 

2046 # to avoid confusion later on. If they are not trashed later 

2047 # the cache will simply be refilled. 

2048 self.cacheManager.remove_from_cache(ref) 

2049 

2050 # If we are in trust mode there will be nothing to move to 

2051 # the trash table and we will have to try to delete the file 

2052 # immediately. 

2053 if self.trustGetRequest: 

2054 # Try to keep the logic below for a single file trash. 

2055 if isinstance(ref, DatasetRef): 

2056 refs = {ref} 

2057 else: 

2058 # Will recreate ref at the end of this branch. 

2059 refs = set(ref) 

2060 

2061 # Determine which datasets are known to datastore directly. 

2062 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2063 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2064 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2065 

2066 missing = refs - existing_refs 

2067 if missing: 

2068 # Do an explicit existence check on these refs. 

2069 # We only care about the artifacts at this point and not 

2070 # the dataset existence. 

2071 artifact_existence: Dict[ResourcePath, bool] = {} 

2072 _ = self.mexists(missing, artifact_existence) 

2073 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2074 

2075 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2076 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2077 for uri in uris: 

2078 try: 

2079 uri.remove() 

2080 except Exception as e: 

2081 if ignore_errors: 

2082 log.debug("Artifact %s could not be removed: %s", uri, e) 

2083 continue 

2084 raise 

2085 

2086 # There is no point asking the code below to remove refs we 

2087 # know are missing so update it with the list of existing 

2088 # records. Try to retain one vs many logic. 

2089 if not existing_refs: 

2090 # Nothing more to do since none of the datasets were 

2091 # known to the datastore record table. 

2092 return 

2093 ref = list(existing_refs) 

2094 if len(ref) == 1: 

2095 ref = ref[0] 

2096 

2097 # Get file metadata and internal metadata 

2098 if not isinstance(ref, DatasetRef): 

2099 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2100 # Assumed to be an iterable of refs so bulk mode enabled. 

2101 try: 

2102 self.bridge.moveToTrash(ref) 

2103 except Exception as e: 

2104 if ignore_errors: 

2105 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2106 else: 

2107 raise 

2108 return 

2109 

2110 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2111 

2112 fileLocations = self._get_dataset_locations_info(ref) 

2113 

2114 if not fileLocations: 

2115 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2116 if ignore_errors: 

2117 log.warning(err_msg) 

2118 return 

2119 else: 

2120 raise FileNotFoundError(err_msg) 

2121 

2122 for location, storedFileInfo in fileLocations: 

2123 if not self._artifact_exists(location): 2123 ↛ 2124line 2123 didn't jump to line 2124

2124 err_msg = ( 

2125 f"Dataset is known to datastore {self.name} but " 

2126 f"associated artifact ({location.uri}) is missing" 

2127 ) 

2128 if ignore_errors: 

2129 log.warning(err_msg) 

2130 return 

2131 else: 

2132 raise FileNotFoundError(err_msg) 

2133 

2134 # Mark dataset as trashed 

2135 try: 

2136 self.bridge.moveToTrash([ref]) 

2137 except Exception as e: 

2138 if ignore_errors: 

2139 log.warning( 

2140 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2141 "but encountered an error: %s", 

2142 ref, 

2143 self.name, 

2144 e, 

2145 ) 

2146 pass 

2147 else: 

2148 raise 

2149 
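# A hypothetical usage sketch: trash() only marks datasets for removal (or, in
# trust mode, deletes artifacts unknown to the records immediately); the
# remaining artifacts are removed later by emptyTrash().
def discard(datastore, refs) -> None:
    datastore.trash(refs, ignore_errors=True)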

2150 @transactional 

2151 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2152 """Remove all datasets from the trash. 

2153 

2154 Parameters 

2155 ---------- 

2156 ignore_errors : `bool` 

2157 If `True` return without error even if something went wrong. 

2158 Problems could occur if another process is simultaneously trying 

2159 to delete. 

2160 """ 

2161 log.debug("Emptying trash in datastore %s", self.name) 

2162 

2163 # Context manager will empty trash iff we finish it without raising. 

2164 # It will also automatically delete the relevant rows from the 

2165 # trash table and the records table. 

2166 with self.bridge.emptyTrash( 

2167 self._table, record_class=StoredFileInfo, record_column="path" 

2168 ) as trash_data: 

2169 # Removing the artifacts themselves requires that the files are 

2170 # not also associated with refs that are not to be trashed. 

2171 # Therefore need to do a query with the file paths themselves 

2172 # and return all the refs associated with them. Can only delete 

2173 # a file if the refs to be trashed are the only refs associated 

2174 # with the file. 

2175 # This requires multiple copies of the trashed items 

2176 trashed, artifacts_to_keep = trash_data 

2177 

2178 if artifacts_to_keep is None: 

2179 # The bridge is not helping us so have to work it out 

2180 # ourselves. This is not going to be as efficient. 

2181 trashed = list(trashed) 

2182 

2183 # The instance check is for mypy since up to this point it 

2184 # does not know the type of info. 

2185 path_map = self._refs_associated_with_artifacts( 

2186 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2187 ) 

2188 

2189 for ref, info in trashed: 

2190 

2191 # Mypy needs to know this is not the base class 

2192 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2193 

2194 # Check for mypy 

2195 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2196 

2197 path_map[info.path].remove(ref.id) 

2198 if not path_map[info.path]: 2198 ↛ 2189line 2198 didn't jump to line 2189, because the condition on line 2198 was never false

2199 del path_map[info.path] 

2200 

2201 artifacts_to_keep = set(path_map) 

2202 

2203 for ref, info in trashed: 

2204 

2205 # Should not happen for this implementation but need 

2206 # to keep mypy happy. 

2207 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2208 

2209 # Mypy needs to know this is not the base class 

2210 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2211 

2212 # Check for mypy 

2213 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2214 

2215 if info.path in artifacts_to_keep: 

2216 # This is a multi-dataset artifact and we are not 

2217 # removing all associated refs. 

2218 continue 

2219 

2220 # Only trashed refs still known to datastore will be returned. 

2221 location = info.file_location(self.locationFactory) 

2222 

2223 # Point of no return for this artifact 

2224 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2225 try: 

2226 self._delete_artifact(location) 

2227 except FileNotFoundError: 

2228 # If the file itself has been deleted there is nothing 

2229 # we can do about it. It is possible that trash has 

2230 # been run in parallel in another process or someone 

2231 # decided to delete the file. It is unlikely to come 

2232 # back and so we should still continue with the removal 

2233 # of the entry from the trash table. It is also possible 

2234 # we removed it in a previous iteration if it was 

2235 # a multi-dataset artifact. The delete artifact method 

2236 # will log a debug message in this scenario. 

2237 # Distinguishing file missing before trash started and 

2238 # file already removed previously as part of this trash 

2239 # is not worth the distinction with regards to potential 

2240 # memory cost. 

2241 pass 

2242 except Exception as e: 

2243 if ignore_errors: 

2244 # Use a debug message here even though it's not 

2245 # a good situation. In some cases this can be 

2246 # caused by a race between user A and user B 

2247 # and neither of them has permissions for the 

2248 # other's files. Butler does not know about users 

2249 # and trash has no idea what collections these 

2250 # files were in (without guessing from a path). 

2251 log.debug( 

2252 "Encountered error removing artifact %s from datastore %s: %s", 

2253 location.uri, 

2254 self.name, 

2255 e, 

2256 ) 

2257 else: 

2258 raise 

2259 
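# A hypothetical usage sketch of the full two-step removal: trash() marks the
# datasets, then emptyTrash() deletes the artifacts, skipping any file that is
# still shared with refs that were not trashed.
def purge(datastore, refs) -> None:
    datastore.trash(refs)
    datastore.emptyTrash(ignore_errors=True)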

2260 @transactional 

2261 def transfer_from( 

2262 self, 

2263 source_datastore: Datastore, 

2264 refs: Iterable[DatasetRef], 

2265 local_refs: Optional[Iterable[DatasetRef]] = None, 

2266 transfer: str = "auto", 

2267 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2268 ) -> None: 

2269 # Docstring inherited 

2270 if type(self) is not type(source_datastore): 

2271 raise TypeError( 

2272 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2273 f"source datastore ({type(source_datastore)})." 

2274 ) 

2275 

2276 # Be explicit for mypy 

2277 if not isinstance(source_datastore, FileDatastore): 2277 ↛ 2278line 2277 didn't jump to line 2278, because the condition on line 2277 was never true

2278 raise TypeError( 

2279 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2280 f" {type(source_datastore)}" 

2281 ) 

2282 

2283 # Stop early if "direct" transfer mode is requested. That would 

2284 # require that the URI inside the source datastore should be stored 

2285 # directly in the target datastore, which seems unlikely to be useful 

2286 # since at any moment the source datastore could delete the file. 

2287 if transfer in ("direct", "split"): 

2288 raise ValueError( 

2289 f"Can not transfer from a source datastore using {transfer} mode since" 

2290 " those files are controlled by the other datastore." 

2291 ) 

2292 

2293 # Empty existence lookup if none given. 

2294 if artifact_existence is None: 

2295 artifact_existence = {} 

2296 

2297 # We will go through the list multiple times so must convert 

2298 # generators to lists. 

2299 refs = list(refs) 

2300 

2301 if local_refs is None: 

2302 local_refs = refs 

2303 else: 

2304 local_refs = list(local_refs) 

2305 

2306 # In order to handle disassembled composites the code works 

2307 # at the records level since it can assume that internal APIs 

2308 # can be used. 

2309 # - If the record already exists in the destination this is assumed 

2310 # to be okay. 

2311 # - If there is no record but the source and destination URIs are 

2312 # identical no transfer is done but the record is added. 

2313 # - If the source record refers to an absolute URI currently assume 

2314 # that that URI should remain absolute and will be visible to the 

2315 # destination butler. May need to have a flag to indicate whether 

2316 # the dataset should be transferred. This will only happen if 

2317 # the detached Butler has had a local ingest. 

2318 

2319 # What we really want is all the records in the source datastore 

2320 # associated with these refs. Or derived ones if they don't exist 

2321 # in the source. 

2322 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2323 

2324 # The source dataset_ids are the keys in these records 

2325 source_ids = set(source_records) 

2326 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2327 

2328 # The not None check is to appease mypy 

2329 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2330 missing_ids = requested_ids - source_ids 

2331 

2332 # Missing IDs can be okay if that datastore has allowed 

2333 # gets based on file existence. Should we transfer what we can 

2334 # or complain about it and warn? 

2335 if missing_ids and not source_datastore.trustGetRequest: 2335 ↛ 2336line 2335 didn't jump to line 2336, because the condition on line 2335 was never true

2336 raise ValueError( 

2337 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2338 ) 

2339 

2340 # Need to map these missing IDs to a DatasetRef so we can guess 

2341 # the details. 

2342 if missing_ids: 

2343 log.info( 

2344 "Number of expected datasets missing from source datastore records: %d out of %d", 

2345 len(missing_ids), 

2346 len(requested_ids), 

2347 ) 

2348 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2349 

2350 # This should be chunked in case we end up having to check 

2351 # the file store since we need some log output to show 

2352 # progress. 

2353 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2354 records = {} 

2355 for missing in missing_ids_chunk: 

2356 # Ask the source datastore where the missing artifacts 

2357 # should be. An execution butler might not know about the 

2358 # artifacts even if they are there. 

2359 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2360 records[missing] = [info for _, info in expected] 

2361 

2362 # Call the mexists helper method in case we have not already 

2363 # checked these artifacts such that artifact_existence is 

2364 # empty. This allows us to benefit from parallelism. 

2365 # datastore.mexists() itself does not give us access to the 

2366 # derived datastore record. 

2367 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2368 ref_exists = source_datastore._process_mexists_records( 

2369 id_to_ref, records, False, artifact_existence=artifact_existence 

2370 ) 

2371 

2372 # Now go through the records and propagate the ones that exist. 

2373 location_factory = source_datastore.locationFactory 

2374 for missing, record_list in records.items(): 

2375 # Skip completely if the ref does not exist. 

2376 ref = id_to_ref[missing] 

2377 if not ref_exists[ref]: 

2378 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2379 continue 

2380 # Check for file artifact to decide which parts of a 

2381 # disassembled composite do exist. If there is only a 

2382 # single record we don't even need to look because it can't 

2383 # be a composite and must exist. 

2384 if len(record_list) == 1: 

2385 dataset_records = record_list 

2386 else: 

2387 dataset_records = [ 

2388 record 

2389 for record in record_list 

2390 if artifact_existence[record.file_location(location_factory).uri] 

2391 ] 

2392 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2393 

2394 # Rely on source_records being a defaultdict. 

2395 source_records[missing].extend(dataset_records) 

2396 

2397 # See if we already have these records 

2398 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2399 

2400 # The artifacts to register 

2401 artifacts = [] 

2402 

2403 # Refs that already exist 

2404 already_present = [] 

2405 

2406 # Now can transfer the artifacts 

2407 for source_ref, target_ref in zip(refs, local_refs): 

2408 if target_ref.id in target_records: 

2409 # Already have an artifact for this. 

2410 already_present.append(target_ref) 

2411 continue 

2412 

2413 # mypy needs to know these are always resolved refs 

2414 for info in source_records[source_ref.getCheckedId()]: 

2415 source_location = info.file_location(source_datastore.locationFactory) 

2416 target_location = info.file_location(self.locationFactory) 

2417 if source_location == target_location: 2417 ↛ 2421line 2417 didn't jump to line 2421, because the condition on line 2417 was never true

2418 # Either the dataset is already in the target datastore 

2419 # (which is how execution butler currently runs) or 

2420 # it is an absolute URI. 

2421 if source_location.pathInStore.isabs(): 

2422 # Just because we can see the artifact when running 

2423 # the transfer doesn't mean it will be generally 

2424 # accessible to a user of this butler. For now warn 

2425 # but assume it will be accessible. 

2426 log.warning( 

2427 "Transfer request for an outside-datastore artifact has been found at %s", 

2428 source_location, 

2429 ) 

2430 else: 

2431 # Need to transfer it to the new location. 

2432 # Assume we should always overwrite. If the artifact 

2433 # is there this might indicate that a previous transfer 

2434 # was interrupted but was not able to be rolled back 

2435 # completely (eg pre-emption) so follow Datastore default 

2436 # and overwrite. 

2437 target_location.uri.transfer_from( 

2438 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2439 ) 

2440 

2441 artifacts.append((target_ref, info)) 

2442 

2443 self._register_datasets(artifacts) 

2444 

2445 if already_present: 

2446 n_skipped = len(already_present) 

2447 log.info( 

2448 "Skipped transfer of %d dataset%s already present in datastore", 

2449 n_skipped, 

2450 "" if n_skipped == 1 else "s", 

2451 ) 

2452 
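# A hypothetical usage sketch: copy datasets between two FileDatastore
# instances, sharing an artifact-existence cache so any trust-mode lookups in
# the source are done only once. "direct" and "split" modes would be rejected.
from typing import Dict

from lsst.resources import ResourcePath


def import_refs(target_datastore, source_datastore, refs) -> None:
    artifact_existence: Dict[ResourcePath, bool] = {}
    target_datastore.transfer_from(
        source_datastore, refs, transfer="copy", artifact_existence=artifact_existence
    )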

2453 @transactional 

2454 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2455 # Docstring inherited. 

2456 refs = list(refs) 

2457 self.bridge.forget(refs) 

2458 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2459 

2460 def validateConfiguration( 

2461 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2462 ) -> None: 

2463 """Validate some of the configuration for this datastore. 

2464 

2465 Parameters 

2466 ---------- 

2467 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2468 Entities to test against this configuration. Can be differing 

2469 types. 

2470 logFailures : `bool`, optional 

2471 If `True`, output a log message for every validation error 

2472 detected. 

2473 

2474 Raises 

2475 ------ 

2476 DatastoreValidationError 

2477 Raised if there is a validation problem with a configuration. 

2478 All the problems are reported in a single exception. 

2479 

2480 Notes 

2481 ----- 

2482 This method checks that all the supplied entities have valid file 

2483 templates and also have formatters defined. 

2484 """ 

2485 

2486 templateFailed = None 

2487 try: 

2488 self.templates.validateTemplates(entities, logFailures=logFailures) 

2489 except FileTemplateValidationError as e: 

2490 templateFailed = str(e) 

2491 

2492 formatterFailed = [] 

2493 for entity in entities: 

2494 try: 

2495 self.formatterFactory.getFormatterClass(entity) 

2496 except KeyError as e: 

2497 formatterFailed.append(str(e)) 

2498 if logFailures: 2498 ↛ 2493line 2498 didn't jump to line 2493, because the condition on line 2498 was never false

2499 log.critical("Formatter failure: %s", e) 

2500 

2501 if templateFailed or formatterFailed: 

2502 messages = [] 

2503 if templateFailed: 2503 ↛ 2504line 2503 didn't jump to line 2504, because the condition on line 2503 was never true

2504 messages.append(templateFailed) 

2505 if formatterFailed: 2505 ↛ 2507line 2505 didn't jump to line 2507, because the condition on line 2505 was never false

2506 messages.append(",".join(formatterFailed)) 

2507 msg = ";\n".join(messages) 

2508 raise DatastoreValidationError(msg) 

2509 
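# A hypothetical usage sketch: validate file templates and formatters for a
# set of dataset types (or refs, or storage classes) up front, logging every
# failure instead of only raising.
from lsst.daf.butler import DatastoreValidationError


def configuration_is_valid(datastore, entities) -> bool:
    try:
        datastore.validateConfiguration(entities, logFailures=True)
    except DatastoreValidationError:
        return False
    return True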

2510 def getLookupKeys(self) -> Set[LookupKey]: 

2511 # Docstring is inherited from base class 

2512 return ( 

2513 self.templates.getLookupKeys() 

2514 | self.formatterFactory.getLookupKeys() 

2515 | self.constraints.getLookupKeys() 

2516 ) 

2517 

2518 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2519 # Docstring is inherited from base class 

2520 # The key can be valid in either formatters or templates so we can 

2521 # only check the template if it exists 

2522 if lookupKey in self.templates: 

2523 try: 

2524 self.templates[lookupKey].validateTemplate(entity) 

2525 except FileTemplateValidationError as e: 

2526 raise DatastoreValidationError(e) from e 

2527 

2528 def export( 

2529 self, 

2530 refs: Iterable[DatasetRef], 

2531 *, 

2532 directory: Optional[ResourcePathExpression] = None, 

2533 transfer: Optional[str] = "auto", 

2534 ) -> Iterable[FileDataset]: 

2535 # Docstring inherited from Datastore.export. 

2536 if transfer is not None and directory is None: 2536 ↛ 2537line 2536 didn't jump to line 2537, because the condition on line 2536 was never true

2537 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2538 

2539 # Force the directory to be a URI object 

2540 directoryUri: Optional[ResourcePath] = None 

2541 if directory is not None: 2541 ↛ 2544line 2541 didn't jump to line 2544, because the condition on line 2541 was never false

2542 directoryUri = ResourcePath(directory, forceDirectory=True) 

2543 

2544 if transfer is not None and directoryUri is not None: 2544 ↛ 2549line 2544 didn't jump to line 2549, because the condition on line 2544 was never false

2545 # mypy needs the second test 

2546 if not directoryUri.exists(): 2546 ↛ 2547line 2546 didn't jump to line 2547, because the condition on line 2546 was never true

2547 raise FileNotFoundError(f"Export location {directory} does not exist") 

2548 

2549 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2550 for ref in progress.wrap(refs, "Exporting dataset files"): 

2551 fileLocations = self._get_dataset_locations_info(ref) 

2552 if not fileLocations: 2552 ↛ 2553line 2552 didn't jump to line 2553, because the condition on line 2552 was never true

2553 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2554 # For now we can not export disassembled datasets 

2555 if len(fileLocations) > 1: 

2556 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2557 location, storedFileInfo = fileLocations[0] 

2558 

2559 pathInStore = location.pathInStore.path 

2560 if transfer is None: 2560 ↛ 2564line 2560 didn't jump to line 2564, because the condition on line 2560 was never true

2561 # TODO: do we also need to return the readStorageClass somehow? 

2562 # We will use the path in store directly. If this is an 

2563 # absolute URI, preserve it. 

2564 if location.pathInStore.isabs(): 

2565 pathInStore = str(location.uri) 

2566 elif transfer == "direct": 2566 ↛ 2568line 2566 didn't jump to line 2568, because the condition on line 2566 was never true

2567 # Use full URIs to the remote store in the export 

2568 pathInStore = str(location.uri) 

2569 else: 

2570 # mypy needs help 

2571 assert directoryUri is not None, "directoryUri must be defined to get here" 

2572 storeUri = ResourcePath(location.uri) 

2573 

2574 # if the datastore has an absolute URI to a resource, we 

2575 # have two options: 

2576 # 1. Keep the absolute URI in the exported YAML 

2577 # 2. Allocate a new name in the local datastore and transfer 

2578 # it. 

2579 # For now go with option 2 

2580 if location.pathInStore.isabs(): 2580 ↛ 2581line 2580 didn't jump to line 2581, because the condition on line 2580 was never true

2581 template = self.templates.getTemplate(ref) 

2582 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2583 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2584 

2585 exportUri = directoryUri.join(pathInStore) 

2586 exportUri.transfer_from(storeUri, transfer=transfer) 

2587 

2588 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2589 
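# A hypothetical usage sketch: export the file artifacts for some refs into a
# directory (which must already exist) and collect the FileDataset entries.
# export() is a generator, so it must be consumed to trigger the transfers.
from lsst.resources import ResourcePath


def export_files(datastore, refs, directory: str = "/tmp/export/"):
    destination = ResourcePath(directory, forceDirectory=True)
    return list(datastore.export(refs, directory=destination, transfer="copy"))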

2590 @staticmethod 

2591 def computeChecksum( 

2592 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2593 ) -> Optional[str]: 

2594 """Compute the checksum of the supplied file. 

2595 

2596 Parameters 

2597 ---------- 

2598 uri : `lsst.resources.ResourcePath` 

2599 Name of resource to calculate checksum from. 

2600 algorithm : `str`, optional 

2601 Name of algorithm to use. Must be one of the algorithms supported 

2602 by :py:mod:`hashlib`. 

2603 block_size : `int` 

2604 Number of bytes to read from file at one time. 

2605 

2606 Returns 

2607 ------- 

2608 hexdigest : `str` 

2609 Hex digest of the file. 

2610 

2611 Notes 

2612 ----- 

2613 Currently returns None if the URI is for a remote resource. 

2614 """ 

2615 if algorithm not in hashlib.algorithms_guaranteed: 2615 ↛ 2616line 2615 didn't jump to line 2616, because the condition on line 2615 was never true

2616 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2617 

2618 if not uri.isLocal: 2618 ↛ 2619line 2618 didn't jump to line 2619, because the condition on line 2618 was never true

2619 return None 

2620 

2621 hasher = hashlib.new(algorithm) 

2622 

2623 with uri.as_local() as local_uri: 

2624 with open(local_uri.ospath, "rb") as f: 

2625 for chunk in iter(lambda: f.read(block_size), b""): 

2626 hasher.update(chunk) 

2627 

2628 return hasher.hexdigest() 

2629 
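# A hypothetical usage sketch: checksum a local file via the static method
# (callable on an instance); remote URIs currently return None as noted above.
from typing import Optional

from lsst.resources import ResourcePath


def checksum_file(datastore, path: str) -> Optional[str]:
    uri = ResourcePath(path)
    return datastore.computeChecksum(uri, algorithm="blake2b", block_size=8192)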

2630 def needs_expanded_data_ids( 

2631 self, 

2632 transfer: Optional[str], 

2633 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2634 ) -> bool: 

2635 # Docstring inherited. 

2636 # This _could_ also use entity to inspect whether the filename template 

2637 # involves placeholders other than the required dimensions for its 

2638 # dataset type, but that's not necessary for correctness; it just 

2639 # enables more optimizations (perhaps only in theory). 

2640 return transfer not in ("direct", None)