Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 83%


839 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 ButlerURI, 

48 CompositesMap, 

49 Config, 

50 DatasetId, 

51 DatasetRef, 

52 DatasetType, 

53 DatasetTypeNotSupportedError, 

54 Datastore, 

55 DatastoreCacheManager, 

56 DatastoreConfig, 

57 DatastoreDisabledCacheManager, 

58 DatastoreValidationError, 

59 FileDataset, 

60 FileDescriptor, 

61 FileTemplates, 

62 FileTemplateValidationError, 

63 Formatter, 

64 FormatterFactory, 

65 Location, 

66 LocationFactory, 

67 Progress, 

68 StorageClass, 

69 StoredFileInfo, 

70 ddl, 

71) 

72from lsst.daf.butler.core.repoRelocation import replaceRoot 

73from lsst.daf.butler.core.utils import transactional 

74from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

75from lsst.utils.introspection import get_class_of, get_instance_of 

76from lsst.utils.iteration import chunk_iterable 

77 

78# For VERBOSE logging usage. 

79from lsst.utils.logging import VERBOSE, getLogger 

80from lsst.utils.timer import time_this 

81from sqlalchemy import BigInteger, String 

82 

83from .genericDatastore import GenericBaseDatastore 

84 

85 if TYPE_CHECKING:  85 ↛ 86  (line 85 didn't jump to line 86, because the condition on line 85 was never true)

86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Dict[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Dict[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ButlerURI 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters( 

212 DatastoreConfig, 

213 config, 

214 full, 

215 toUpdate={"root": root}, 

216 toCopy=("cls", ("records", "table")), 

217 overwrite=overwrite, 

218 ) 

219 

220 @classmethod 

221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

222 return ddl.TableSpec( 

223 fields=[ 

224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

228 # Use empty string to indicate no component 

229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

230 # TODO: should checksum be Base64Bytes instead? 

231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

233 ], 

234 unique=frozenset(), 

235 indexes=[tuple(["path"])], 

236 ) 

237 
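# Illustrative sketch, not part of this module: the shape of one row in the
# opaque records table defined by makeTableSpec() above. Field names come from
# the spec; every value shown here is hypothetical.
example_record = {
    "dataset_id": 42,  # actual type depends on datasetIdColumnType (integer or UUID)
    "path": "run1/raw/raw_903334_R22_S11.fits",
    "formatter": "lsst.obs.base.formatters.fitsExposure.FitsExposureFormatter",
    "storage_class": "ExposureF",
    "component": "",  # empty string indicates "no component"
    "checksum": None,  # only populated when the "checksum" config option is enabled
    "file_size": 16773120,
}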

238 def __init__( 

239 self, 

240 config: Union[DatastoreConfig, str], 

241 bridgeManager: DatastoreRegistryBridgeManager, 

242 butlerRoot: Optional[str] = None,

243 ): 

244 super().__init__(config, bridgeManager) 

245 if "root" not in self.config: 245 ↛ 246line 245 didn't jump to line 246, because the condition on line 245 was never true

246 raise ValueError("No root directory specified in configuration") 

247 

248 # Name ourselves either using an explicit name or a name 

249 # derived from the (unexpanded) root 

250 if "name" in self.config: 

251 self.name = self.config["name"] 

252 else: 

253 # We use the unexpanded root in the name to indicate that this 

254 # datastore can be moved without having to update registry. 

255 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

256 

257 # Support repository relocation in config 

258 # Existence of self.root is checked in subclass 

259 self.root = ButlerURI( 

260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

261 ) 

262 

263 self.locationFactory = LocationFactory(self.root) 

264 self.formatterFactory = FormatterFactory() 

265 

266 # Now associate formatters with storage classes 

267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

268 

269 # Read the file naming templates 

270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

271 

272 # See if composites should be disassembled 

273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

280 ) 

281 # Interface to Registry. 

282 self._bridge = bridgeManager.register(self.name) 

283 except ReadOnlyDatabaseError: 

284 # If the database is read only and we just tried and failed to 

285 # create a table, it means someone is trying to create a read-only 

286 # butler client for an empty repo. That should be okay, as long 

287 as they don't then try to get any datasets before some other client

288 creates the table. Chances are they're just validating

289 # configuration. 

290 pass 

291 

292 # Determine whether checksums should be used - default to False 

293 self.useChecksum = self.config.get("checksum", False) 

294 

295 # Determine whether we can fall back to configuration if a 

296 # requested dataset is not known to registry 

297 self.trustGetRequest = self.config.get("trust_get_request", False) 

298 

299 # Create a cache manager 

300 self.cacheManager: AbstractDatastoreCacheManager 

301 if "cached" in self.config: 301 ↛ 304line 301 didn't jump to line 304, because the condition on line 301 was never false

302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

305 

306 # Check existence and create directory structure if necessary 

307 if not self.root.exists(): 

308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309line 308 didn't jump to line 309, because the condition on line 308 was never true

309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

310 try: 

311 self.root.mkdir() 

312 except Exception as e: 

313 raise ValueError( 

314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

315 ) from e 

316 
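# Illustrative sketch, not part of this module: the configuration keys consumed
# by __init__ above, written out as a plain dict. The key names are taken from
# the code; the values shown are hypothetical.
example_datastore_config = {
    "root": "<butlerRoot>/datastore",  # required; replaceRoot() substitutes the butler root
    "create": True,  # allow the root directory to be created if missing
    "records": {"table": "file_datastore_records"},
    "checksum": False,  # sets self.useChecksum
    "trust_get_request": False,  # sets self.trustGetRequest
    "formatters": {},  # storage class / dataset type to formatter mappings
    "templates": {},  # file naming templates
    "composites": {},  # disassembly rules used by CompositesMap
    "cached": {},  # cache manager configuration (omit to disable caching)
}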

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 def _artifact_exists(self, location: Location) -> bool: 

325 """Check that an artifact exists in this datastore at the specified 

326 location. 

327 

328 Parameters 

329 ---------- 

330 location : `Location` 

331 Expected location of the artifact associated with this datastore. 

332 

333 Returns 

334 ------- 

335 exists : `bool` 

336 `True` if the location can be found, `False` otherwise.

337 """ 

338 log.debug("Checking if resource exists: %s", location.uri) 

339 return location.uri.exists() 

340 

341 def _delete_artifact(self, location: Location) -> None: 

342 """Delete the artifact from the datastore. 

343 

344 Parameters 

345 ---------- 

346 location : `Location` 

347 Location of the artifact associated with this datastore. 

348 """ 

349 if location.pathInStore.isabs():  349 ↛ 350  (line 349 didn't jump to line 350, because the condition on line 349 was never true)

350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

351 

352 try: 

353 location.uri.remove() 

354 except FileNotFoundError: 

355 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

356 raise 

357 except Exception as e: 

358 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

359 raise 

360 log.debug("Successfully deleted file: %s", location.uri) 

361 

362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

363 # Docstring inherited from GenericBaseDatastore 

364 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

365 self._table.insert(*records) 

366 

367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

368 # Docstring inherited from GenericBaseDatastore 

369 

370 # Look for the dataset_id -- there might be multiple matches 

371 # if we have disassembled the dataset. 

372 records = self._table.fetch(dataset_id=ref.id) 

373 return [StoredFileInfo.from_record(record) for record in records] 

374 

375 def _get_stored_records_associated_with_refs( 

376 self, refs: Iterable[DatasetIdRef] 

377 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

378 """Retrieve all records associated with the provided refs. 

379 

380 Parameters 

381 ---------- 

382 refs : iterable of `DatasetIdRef` 

383 The refs for which records are to be retrieved. 

384 

385 Returns 

386 ------- 

387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

388 The matching records indexed by the ref ID. The number of entries 

389 in the dict can be smaller than the number of requested refs. 

390 """ 

391 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

392 

393 # Uniqueness is dataset_id + component so can have multiple records 

394 # per ref. 

395 records_by_ref = defaultdict(list) 

396 for record in records: 

397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

398 return records_by_ref 

399 

400 def _refs_associated_with_artifacts( 

401 self, paths: List[Union[str, ButlerURI]] 

402 ) -> Dict[str, Set[DatasetId]]: 

403 """Return paths and associated dataset refs. 

404 

405 Parameters 

406 ---------- 

407 paths : `list` of `str` or `ButlerURI` 

408 All the paths to include in search. 

409 

410 Returns 

411 ------- 

412 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

413 Mapping of each path to a set of associated database IDs. 

414 """ 

415 records = self._table.fetch(path=[str(path) for path in paths]) 

416 result = defaultdict(set) 

417 for row in records: 

418 result[row["path"]].add(row["dataset_id"]) 

419 return result 

420 

421 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

422 """Return all dataset refs associated with the supplied path. 

423 

424 Parameters 

425 ---------- 

426 pathInStore : `ButlerURI` 

427 Path of interest in the data store. 

428 

429 Returns 

430 ------- 

431 ids : `set` of `DatasetId`

432 All `DatasetRef` IDs associated with this path. 

433 """ 

434 records = list(self._table.fetch(path=str(pathInStore))) 

435 ids = {r["dataset_id"] for r in records} 

436 return ids 

437 

438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

439 # Docstring inherited from GenericBaseDatastore 

440 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

441 

442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

443 r"""Find all the `Location`\ s of the requested dataset in the 

444 `Datastore` and the associated stored file information. 

445 

446 Parameters 

447 ---------- 

448 ref : `DatasetRef` 

449 Reference to the required `Dataset`. 

450 

451 Returns 

452 ------- 

453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

454 Location of the dataset within the datastore and 

455 stored information about each file and its formatter. 

456 """ 

457 # Get the file information (this will fail if no file) 

458 records = self.getStoredItemsInfo(ref) 

459 

460 # Use the path to determine the location -- we need to take 

461 # into account absolute URIs in the datastore record 

462 return [(r.file_location(self.locationFactory), r) for r in records] 

463 

464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

465 """Check that there is only one dataset associated with the 

466 specified artifact. 

467 

468 Parameters 

469 ---------- 

470 ref : `DatasetRef` or `FakeDatasetRef` 

471 Dataset to be removed. 

472 location : `Location` 

473 The location of the artifact to be removed. 

474 

475 Returns 

476 ------- 

477 can_remove : `bool`

478 True if the artifact can be safely removed. 

479 """ 

480 # Can't ever delete absolute URIs. 

481 if location.pathInStore.isabs(): 

482 return False 

483 

484 # Get all entries associated with this path 

485 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

486 if not allRefs: 

487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

488 

489 # Remove these refs from all the refs and if there is nothing left 

490 # then we can delete 

491 remainingRefs = allRefs - {ref.id} 

492 

493 if remainingRefs: 

494 return False 

495 return True 

496 
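# Illustrative sketch, not part of this module: the ref-counting rule applied by
# _can_remove_dataset_artifact above, reduced to a pure function. An artifact may
# only be deleted when its path is inside the datastore and no other dataset ID
# still references it.
def _may_delete_artifact(all_ref_ids: set, ref_id, path_is_absolute: bool) -> bool:
    if path_is_absolute:
        return False  # absolute URIs (e.g. "direct" ingests) are never deleted
    return not (all_ref_ids - {ref_id})  # deletable only if no other refs remain

assert _may_delete_artifact({1, 2}, 1, False) is False  # another dataset still uses the file
assert _may_delete_artifact({1}, 1, False) is True  # last reference, safe to remove
assert _may_delete_artifact({1}, 1, True) is False  # outside-the-store artifact is left alone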

497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

498 """Predict the location and related file information of the requested 

499 dataset in this datastore. 

500 

501 Parameters 

502 ---------- 

503 ref : `DatasetRef` 

504 Reference to the required `Dataset`. 

505 

506 Returns 

507 ------- 

508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

509 Expected Location of the dataset within the datastore and 

510 placeholder information about each file and its formatter. 

511 

512 Notes 

513 ----- 

514 Uses the current configuration to determine how we would expect the 

515 datastore files to have been written if we couldn't ask registry. 

516 This is safe so long as there has been no change to datastore 

517 configuration between writing the dataset and wanting to read it. 

518 Will not work for files that have been ingested without using the 

519 standard file template or default formatter. 

520 """ 

521 

522 # If we have a component ref we always need to ask the questions 

523 # of the composite. If the composite is disassembled this routine 

524 # should return all components. If the composite was not 

525 # disassembled the composite is what is stored regardless of 

526 # component request. Note that if the caller has disassembled 

527 # a composite there is no way for this guess to know that 

528 # without trying both the composite and component ref and seeing 

529 # if there is something at the component Location even without 

530 # disassembly being enabled. 

531 if ref.datasetType.isComponent(): 

532 ref = ref.makeCompositeRef() 

533 

534 # See if the ref is a composite that should be disassembled 

535 doDisassembly = self.composites.shouldBeDisassembled(ref) 

536 

537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

538 

539 if doDisassembly: 

540 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

541 compRef = ref.makeComponentRef(component) 

542 location, formatter = self._determine_put_formatter_location(compRef) 

543 all_info.append((location, formatter, componentStorage, component)) 

544 

545 else: 

546 # Always use the composite ref if no disassembly 

547 location, formatter = self._determine_put_formatter_location(ref) 

548 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

549 

550 # Convert the list of tuples to have StoredFileInfo as second element 

551 return [ 

552 ( 

553 location, 

554 StoredFileInfo( 

555 formatter=formatter, 

556 path=location.pathInStore.path, 

557 storageClass=storageClass, 

558 component=component, 

559 checksum=None, 

560 file_size=-1, 

561 ), 

562 ) 

563 for location, formatter, storageClass, component in all_info 

564 ] 

565 

566 def _prepare_for_get( 

567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

568 ) -> List[DatastoreFileGetInformation]: 

569 """Check parameters for ``get`` and obtain formatter and 

570 location. 

571 

572 Parameters 

573 ---------- 

574 ref : `DatasetRef` 

575 Reference to the required Dataset. 

576 parameters : `dict` 

577 `StorageClass`-specific parameters that specify, for example, 

578 a slice of the dataset to be loaded. 

579 

580 Returns 

581 ------- 

582 getInfo : `list` [`DatastoreFileGetInformation`] 

583 Parameters needed to retrieve each file. 

584 """ 

585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

586 

587 # Get file metadata and internal metadata 

588 fileLocations = self._get_dataset_locations_info(ref) 

589 if not fileLocations: 

590 if not self.trustGetRequest: 

591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

592 # Assume the dataset is where we think it should be 

593 fileLocations = self._get_expected_dataset_locations_info(ref) 

594 

595 # The storage class we want to use eventually 

596 refStorageClass = ref.datasetType.storageClass 

597 

598 if len(fileLocations) > 1: 

599 disassembled = True 

600 

601 # If trust is involved it is possible that there will be 

602 # components listed here that do not exist in the datastore. 

603 # Explicitly check for file artifact existence and filter out any 

604 # that are missing. 

605 if self.trustGetRequest: 

606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

607 

608 # For now complain only if we have no components at all. One 

609 # component is probably a problem but we can punt that to the 

610 # assembler. 

611 if not fileLocations:  611 ↛ 612  (line 611 didn't jump to line 612, because the condition on line 611 was never true)

612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

613 

614 else: 

615 disassembled = False 

616 

617 # Is this a component request? 

618 refComponent = ref.datasetType.component() 

619 

620 fileGetInfo = [] 

621 for location, storedFileInfo in fileLocations: 

622 

623 # The storage class used to write the file 

624 writeStorageClass = storedFileInfo.storageClass 

625 

626 # If this has been disassembled we need read to match the write 

627 if disassembled: 

628 readStorageClass = writeStorageClass 

629 else: 

630 readStorageClass = refStorageClass 

631 

632 formatter = get_instance_of( 

633 storedFileInfo.formatter, 

634 FileDescriptor( 

635 location, 

636 readStorageClass=readStorageClass, 

637 storageClass=writeStorageClass, 

638 parameters=parameters, 

639 ), 

640 ref.dataId, 

641 ) 

642 

643 formatterParams, notFormatterParams = formatter.segregateParameters() 

644 

645 # Of the remaining parameters, extract the ones supported by 

646 # this StorageClass (for components not all will be handled) 

647 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

648 

649 # The ref itself could be a component if the dataset was 

650 # disassembled by butler, or we disassembled in datastore and 

651 # components came from the datastore records 

652 component = storedFileInfo.component if storedFileInfo.component else refComponent 

653 

654 fileGetInfo.append( 

655 DatastoreFileGetInformation( 

656 location, 

657 formatter, 

658 storedFileInfo, 

659 assemblerParams, 

660 formatterParams, 

661 component, 

662 readStorageClass, 

663 ) 

664 ) 

665 

666 return fileGetInfo 

667 

668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

669 """Check the arguments for ``put`` and obtain formatter and 

670 location. 

671 

672 Parameters 

673 ---------- 

674 inMemoryDataset : `object` 

675 The dataset to store. 

676 ref : `DatasetRef` 

677 Reference to the associated Dataset. 

678 

679 Returns 

680 ------- 

681 location : `Location` 

682 The location to write the dataset. 

683 formatter : `Formatter` 

684 The `Formatter` to use to write the dataset. 

685 

686 Raises 

687 ------ 

688 TypeError 

689 Supplied object and storage class are inconsistent. 

690 DatasetTypeNotSupportedError 

691 The associated `DatasetType` is not handled by this datastore. 

692 """ 

693 self._validate_put_parameters(inMemoryDataset, ref) 

694 return self._determine_put_formatter_location(ref) 

695 

696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

697 """Calculate the formatter and output location to use for put. 

698 

699 Parameters 

700 ---------- 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 """ 

711 # Work out output file name 

712 try: 

713 template = self.templates.getTemplate(ref) 

714 except KeyError as e: 

715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

716 

717 # Validate the template to protect against different dataIds

718 # producing the same filename and causing overwrite confusion.

719 template.validateTemplate(ref) 

720 

721 location = self.locationFactory.fromPath(template.format(ref)) 

722 

723 # Get the formatter based on the storage class 

724 storageClass = ref.datasetType.storageClass 

725 try: 

726 formatter = self.formatterFactory.getFormatter( 

727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

728 ) 

729 except KeyError as e: 

730 raise DatasetTypeNotSupportedError( 

731 f"Unable to find formatter for {ref} in datastore {self.name}" 

732 ) from e 

733 

734 # Now that we know the formatter, update the location 

735 location = formatter.makeUpdatedLocation(location) 

736 

737 return location, formatter 

738 

739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

740 # Docstring inherited from base class 

741 if transfer != "auto": 

742 return transfer 

743 

744 # See if the paths are within the datastore or not 

745 inside = [self._pathInStore(d.path) is not None for d in datasets] 

746 

747 if all(inside): 

748 transfer = None 

749 elif not any(inside):  749 ↛ 758  (line 749 didn't jump to line 758, because the condition on line 749 was never false)

750 # Allow ButlerURI to use its own knowledge 

751 transfer = "auto" 

752 else: 

753 # This can happen when importing from a datastore that 

754 # has had some datasets ingested using "direct" mode. 

755 # Also allow ButlerURI to sort it out but warn about it, because

756 # any files outside the datastore will be referenced in place

757 # rather than copied into the target datastore.

758 log.warning( 

759 "Some datasets are inside the datastore and some are outside. Using 'split' " 

760 "transfer mode. This assumes that the files outside the datastore are " 

761 "still accessible to the new butler since they will not be copied into " 

762 "the target datastore." 

763 ) 

764 transfer = "split" 

765 

766 return transfer 

767 
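# Illustrative sketch, not part of this module: how the "auto" transfer mode is
# resolved by _overrideTransferMode above, reduced to a pure function over the
# per-dataset "is this path already inside the datastore" flags.
from typing import List, Optional

def _resolve_auto_transfer(inside_flags: List[bool]) -> Optional[str]:
    if all(inside_flags):
        return None  # everything already in the datastore: no transfer needed
    if not any(inside_flags):
        return "auto"  # all external: let ButlerURI choose the mechanism
    return "split"  # mixture: internal files stay put, external ones are used in place

assert _resolve_auto_transfer([True, True]) is None
assert _resolve_auto_transfer([False, False]) == "auto"
assert _resolve_auto_transfer([True, False]) == "split"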

768 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

769 """Return path relative to datastore root 

770 

771 Parameters 

772 ---------- 

773 path : `str` or `ButlerURI` 

774 Path to dataset. Can be an absolute URI. If relative, it is

775 assumed to be relative to the datastore root. The path within

776 the datastore is returned, or `None` if the path is outside it.

777 

778 Returns 

779 ------- 

780 inStore : `str` 

781 Path relative to datastore root. Returns `None` if the file is 

782 outside the root. 

783 """ 

784 # Relative path will always be relative to datastore 

785 pathUri = ButlerURI(path, forceAbsolute=False) 

786 return pathUri.relative_to(self.root) 

787 

788 def _standardizeIngestPath( 

789 self, path: Union[str, ButlerURI], *, transfer: Optional[str] = None 

790 ) -> Union[str, ButlerURI]: 

791 """Standardize the path of a to-be-ingested file. 

792 

793 Parameters 

794 ---------- 

795 path : `str` or `ButlerURI` 

796 Path of a file to be ingested. 

797 transfer : `str`, optional 

798 How (and whether) the dataset should be added to the datastore. 

799 See `ingest` for details of transfer modes. 

800 This implementation is provided only so 

801 `NotImplementedError` can be raised if the mode is not supported; 

802 actual transfers are deferred to `_extractIngestInfo`. 

803 

804 Returns 

805 ------- 

806 path : `str` or `ButlerURI` 

807 New path in what the datastore considers standard form. If an 

808 absolute URI was given that will be returned unchanged. 

809 

810 Notes 

811 ----- 

812 Subclasses of `FileDatastore` can implement this method instead 

813 of `_prepIngest`. It should not modify the data repository or given 

814 file in any way. 

815 

816 Raises 

817 ------ 

818 NotImplementedError 

819 Raised if the datastore does not support the given transfer mode 

820 (including the case where ingest is not supported at all). 

821 FileNotFoundError 

822 Raised if one of the given files does not exist. 

823 """ 

824 if transfer not in (None, "direct", "split") + self.root.transferModes:  824 ↛ 825  (line 824 didn't jump to line 825, because the condition on line 824 was never true)

825 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

826 

827 # A relative URI indicates relative to datastore root 

828 srcUri = ButlerURI(path, forceAbsolute=False) 

829 if not srcUri.isabs(): 

830 srcUri = self.root.join(path) 

831 

832 if not srcUri.exists(): 

833 raise FileNotFoundError( 

834 f"Resource at {srcUri} does not exist; note that paths to ingest " 

835 f"are assumed to be relative to {self.root} unless they are absolute." 

836 ) 

837 

838 if transfer is None: 

839 relpath = srcUri.relative_to(self.root) 

840 if not relpath: 

841 raise RuntimeError( 

842 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

843 ) 

844 

845 # Return the relative path within the datastore for internal 

846 # transfer 

847 path = relpath 

848 

849 return path 

850 

851 def _extractIngestInfo( 

852 self, 

853 path: Union[str, ButlerURI], 

854 ref: DatasetRef, 

855 *, 

856 formatter: Union[Formatter, Type[Formatter]], 

857 transfer: Optional[str] = None, 

858 ) -> StoredFileInfo: 

859 """Relocate (if necessary) and extract `StoredFileInfo` from a 

860 to-be-ingested file. 

861 

862 Parameters 

863 ---------- 

864 path : `str` or `ButlerURI` 

865 URI or path of a file to be ingested. 

866 ref : `DatasetRef` 

867 Reference for the dataset being ingested. Guaranteed to have 

868 ``dataset_id is not None``.

869 formatter : `type` or `Formatter` 

870 `Formatter` subclass to use for this dataset or an instance. 

871 transfer : `str`, optional 

872 How (and whether) the dataset should be added to the datastore. 

873 See `ingest` for details of transfer modes. 

874 

875 Returns 

876 ------- 

877 info : `StoredFileInfo` 

878 Internal datastore record for this file. This will be inserted by 

879 the caller; the `_extractIngestInfo` is only responsible for 

880 creating and populating the struct. 

881 

882 Raises 

883 ------ 

884 FileNotFoundError 

885 Raised if one of the given files does not exist. 

886 FileExistsError 

887 Raised if transfer is not `None` but the (internal) location the 

888 file would be moved to is already occupied. 

889 """ 

890 if self._transaction is None:  890 ↛ 891  (line 890 didn't jump to line 891, because the condition on line 890 was never true)

891 raise RuntimeError("Ingest called without transaction enabled") 

892 

893 # Create URI of the source path, do not need to force a relative 

894 # path to absolute. 

895 srcUri = ButlerURI(path, forceAbsolute=False) 

896 

897 # Track whether we have read the size of the source yet 

898 have_sized = False 

899 

900 tgtLocation: Optional[Location] 

901 if transfer is None or transfer == "split": 

902 # A relative path is assumed to be relative to the datastore 

903 # in this context 

904 if not srcUri.isabs(): 

905 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

906 else: 

907 # Work out the path in the datastore from an absolute URI 

908 # This is required to be within the datastore. 

909 pathInStore = srcUri.relative_to(self.root) 

910 if pathInStore is None and transfer is None:  910 ↛ 911  (line 910 didn't jump to line 911, because the condition on line 910 was never true)

911 raise RuntimeError( 

912 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

913 ) 

914 if pathInStore:  914 ↛ 916  (line 914 didn't jump to line 916, because the condition on line 914 was never false)

915 tgtLocation = self.locationFactory.fromPath(pathInStore) 

916 elif transfer == "split": 

917 # Outside the datastore but treat that as a direct ingest 

918 # instead. 

919 tgtLocation = None 

920 else: 

921 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

922 elif transfer == "direct":  922 ↛ 927  (line 922 didn't jump to line 927, because the condition on line 922 was never true)

923 # Want to store the full URI to the resource directly in 

924 # datastore. This is useful for referring to permanent archive 

925 # storage for raw data. 

926 # Trust that people know what they are doing. 

927 tgtLocation = None 

928 else: 

929 # Work out the name we want this ingested file to have 

930 # inside the datastore 

931 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

932 if not tgtLocation.uri.dirname().exists(): 

933 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

934 tgtLocation.uri.dirname().mkdir() 

935 

936 # if we are transferring from a local file to a remote location 

937 # it may be more efficient to get the size and checksum of the 

938 # local file rather than the transferred one 

939 if not srcUri.scheme or srcUri.scheme == "file":  939 ↛ 949  (line 939 didn't jump to line 949, because the condition on line 939 was never false)

940 size = srcUri.size() 

941 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

942 have_sized = True 

943 

944 # Transfer the resource to the destination. 

945 # Allow overwrite of an existing file. This matches the behavior 

946 # of datastore.put() in that it trusts that registry would not 

947 # be asking to overwrite unless registry thought that the 

948 # overwrite was allowed. 

949 tgtLocation.uri.transfer_from( 

950 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

951 ) 

952 

953 if tgtLocation is None:  953 ↛ 955  (line 953 didn't jump to line 955, because the condition on line 953 was never true)

954 # This means we are using direct mode 

955 targetUri = srcUri 

956 targetPath = str(srcUri) 

957 else: 

958 targetUri = tgtLocation.uri 

959 targetPath = tgtLocation.pathInStore.path 

960 

961 # the file should exist in the datastore now 

962 if not have_sized: 

963 size = targetUri.size() 

964 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

965 

966 return StoredFileInfo( 

967 formatter=formatter, 

968 path=targetPath, 

969 storageClass=ref.datasetType.storageClass, 

970 component=ref.datasetType.component(), 

971 file_size=size, 

972 checksum=checksum, 

973 ) 

974 

975 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

976 # Docstring inherited from Datastore._prepIngest. 

977 filtered = [] 

978 for dataset in datasets: 

979 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

980 if not acceptable: 

981 continue 

982 else: 

983 dataset.refs = acceptable 

984 if dataset.formatter is None: 

985 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

986 else: 

987 assert isinstance(dataset.formatter, (type, str)) 

988 formatter_class = get_class_of(dataset.formatter) 

989 if not issubclass(formatter_class, Formatter):  989 ↛ 990  (line 989 didn't jump to line 990, because the condition on line 989 was never true)

990 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

991 dataset.formatter = formatter_class 

992 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

993 filtered.append(dataset) 

994 return _IngestPrepData(filtered) 

995 

996 @transactional 

997 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

998 # Docstring inherited from Datastore._finishIngest. 

999 refsAndInfos = [] 

1000 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1001 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1002 # Do ingest as if the first dataset ref is associated with the file 

1003 info = self._extractIngestInfo( 

1004 dataset.path, dataset.refs[0], formatter=dataset.formatter, transfer=transfer 

1005 ) 

1006 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1007 self._register_datasets(refsAndInfos) 

1008 
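# Illustrative sketch, not part of this module: a plausible orchestration of the
# two-phase ingest implemented above. _prepIngest() filters refs through the
# constraints, resolves a Formatter class and standardizes paths without touching
# the repository; _finishIngest() then transfers the files and registers the
# (ref, StoredFileInfo) pairs. The `datastore` and `file_datasets` names are
# hypothetical; the real entry point is Datastore.ingest(), which also manages
# the enclosing transaction.
def ingest_files(datastore, file_datasets, transfer="auto"):
    transfer = datastore._overrideTransferMode(*file_datasets, transfer=transfer)
    prep_data = datastore._prepIngest(*file_datasets, transfer=transfer)
    datastore._finishIngest(prep_data, transfer=transfer)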

1009 def _calculate_ingested_datastore_name( 

1010 self, srcUri: ButlerURI, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1011 ) -> Location: 

1012 """Given a source URI and a DatasetRef, determine the name the 

1013 dataset will have inside the datastore.

1014 

1015 Parameters 

1016 ---------- 

1017 srcUri : `ButlerURI` 

1018 URI to the source dataset file. 

1019 ref : `DatasetRef` 

1020 Ref associated with the newly-ingested dataset artifact. This 

1021 is used to determine the name within the datastore. 

1022 formatter : `Formatter` or Formatter class. 

1023 Formatter to use for validation. Can be a class or an instance. 

1024 

1025 Returns 

1026 ------- 

1027 location : `Location` 

1028 Target location for the newly-ingested dataset. 

1029 """ 

1030 # Ingesting a file from outside the datastore. 

1031 # This involves a new name. 

1032 template = self.templates.getTemplate(ref) 

1033 location = self.locationFactory.fromPath(template.format(ref)) 

1034 

1035 # Get the extension 

1036 ext = srcUri.getExtension() 

1037 

1038 # Update the destination to include that extension 

1039 location.updateExtension(ext) 

1040 

1041 # Ask the formatter to validate this extension 

1042 formatter.validateExtension(location) 

1043 

1044 return location 

1045 

1046 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1047 """Write out in memory dataset to datastore. 

1048 

1049 Parameters 

1050 ---------- 

1051 inMemoryDataset : `object` 

1052 Dataset to write to datastore. 

1053 ref : `DatasetRef` 

1054 Registry information associated with this dataset. 

1055 

1056 Returns 

1057 ------- 

1058 info : `StoredFileInfo` 

1059 Information describing the artifact written to the datastore. 

1060 """ 

1061 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1062 uri = location.uri 

1063 

1064 if not uri.dirname().exists(): 

1065 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1066 uri.dirname().mkdir() 

1067 

1068 if self._transaction is None:  1068 ↛ 1069  (line 1068 didn't jump to line 1069, because the condition on line 1068 was never true)

1069 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1070 

1071 def _removeFileExists(uri: ButlerURI) -> None: 

1072 """Remove a file and do not complain if it is not there. 

1073 

1074 This is important since a formatter might fail before the file 

1075 is written and we should not confuse people by writing spurious 

1076 error messages to the log. 

1077 """ 

1078 try: 

1079 uri.remove() 

1080 except FileNotFoundError: 

1081 pass 

1082 

1083 # Register a callback to try to delete the uploaded data if 

1084 # something fails below 

1085 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1086 

1087 # For a local file, simply use the formatter directly 

1088 if uri.isLocal: 

1089 try: 

1090 formatter.write(inMemoryDataset) 

1091 except Exception as e: 

1092 raise RuntimeError( 

1093 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1094 ) from e 

1095 log.debug("Successfully wrote python object to local file at %s", uri) 

1096 else: 

1097 # This is a remote URI. Some datasets can be serialized directly 

1098 # to bytes and sent to the remote datastore without writing a 

1099 # file. If the dataset is intended to be saved to the cache 

1100 # a file is always written and direct write to the remote 

1101 # datastore is bypassed. 

1102 data_written = False 

1103 if not self.cacheManager.should_be_cached(ref): 

1104 try: 

1105 serializedDataset = formatter.toBytes(inMemoryDataset) 

1106 except NotImplementedError: 

1107 # Fallback to the file writing option. 

1108 pass 

1109 except Exception as e: 

1110 raise RuntimeError( 

1111 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1112 ) from e 

1113 else: 

1114 log.debug("Writing bytes directly to %s", uri) 

1115 uri.write(serializedDataset, overwrite=True) 

1116 log.debug("Successfully wrote bytes directly to %s", uri) 

1117 data_written = True 

1118 

1119 if not data_written: 

1120 # Did not write the bytes directly to object store so instead 

1121 # write to temporary file. 

1122 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1123 # Need to configure the formatter to write to a different 

1124 # location and that needs us to overwrite internals 

1125 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1126 with formatter._updateLocation(Location(None, temporary_uri)): 

1127 try: 

1128 formatter.write(inMemoryDataset) 

1129 except Exception as e: 

1130 raise RuntimeError( 

1131 f"Failed to serialize dataset {ref} of type" 

1132 f" {type(inMemoryDataset)} to " 

1133 f"temporary location {temporary_uri}" 

1134 ) from e 

1135 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1136 

1137 # Cache if required 

1138 self.cacheManager.move_to_cache(temporary_uri, ref) 

1139 

1140 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1141 

1142 # URI is needed to resolve which ingest case we are dealing with.

1143 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1144 
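# Illustrative sketch, not part of this module: the write-path decision made in
# _write_in_memory_to_artifact above, reduced to a pure function (the real code
# also falls back to the temporary-file path when toBytes() raises
# NotImplementedError).
def _write_strategy(is_local_uri: bool, should_cache: bool, formatter_has_to_bytes: bool) -> str:
    if is_local_uri:
        return "formatter.write"  # write straight to the local artifact
    if not should_cache and formatter_has_to_bytes:
        return "uri.write(bytes)"  # upload serialized bytes directly to the remote store
    return "temporary file + transfer_from"  # write locally, copy to the store, maybe cache

assert _write_strategy(True, False, True) == "formatter.write"
assert _write_strategy(False, False, True) == "uri.write(bytes)"
assert _write_strategy(False, True, True) == "temporary file + transfer_from"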

1145 def _read_artifact_into_memory( 

1146 self, 

1147 getInfo: DatastoreFileGetInformation, 

1148 ref: DatasetRef, 

1149 isComponent: bool = False, 

1150 cache_ref: Optional[DatasetRef] = None, 

1151 ) -> Any: 

1152 """Read the artifact from datastore into in memory object. 

1153 

1154 Parameters 

1155 ---------- 

1156 getInfo : `DatastoreFileGetInformation` 

1157 Information about the artifact within the datastore. 

1158 ref : `DatasetRef` 

1159 The registry information associated with this artifact. 

1160 isComponent : `bool` 

1161 Flag to indicate if a component is being read from this artifact. 

1162 cache_ref : `DatasetRef`, optional 

1163 The DatasetRef to use when looking up the file in the cache. 

1164 This ref must have the same ID as the supplied ref but can 

1165 be a parent ref or component ref to indicate to the cache whether 

1166 a composite file is being requested from the cache or a component 

1167 file. Without this the cache will default to the supplied ref but 

1168 it can get confused with read-only derived components for 

1169 disassembled composites. 

1170 

1171 Returns 

1172 ------- 

1173 inMemoryDataset : `object` 

1174 The artifact as a python object. 

1175 """ 

1176 location = getInfo.location 

1177 uri = location.uri 

1178 log.debug("Accessing data from %s", uri) 

1179 

1180 if cache_ref is None: 

1181 cache_ref = ref 

1182 if cache_ref.id != ref.id:  1182 ↛ 1183  (line 1182 didn't jump to line 1183, because the condition on line 1182 was never true)

1183 raise ValueError( 

1184 "The supplied cache dataset ref refers to a different dataset than expected:" 

1185 f" {ref.id} != {cache_ref.id}" 

1186 ) 

1187 

1188 # Cannot recalculate checksum but can compare size as a quick check 

1189 # Do not do this if the size is negative since that indicates 

1190 # we do not know. 

1191 recorded_size = getInfo.info.file_size 

1192 resource_size = uri.size() 

1193 if recorded_size >= 0 and resource_size != recorded_size:  1193 ↛ 1194  (line 1193 didn't jump to line 1194, because the condition on line 1193 was never true)

1194 raise RuntimeError( 

1195 "Integrity failure in Datastore. " 

1196 f"Size of file {uri} ({resource_size}) " 

1197 f"does not match size recorded in registry of {recorded_size}" 

1198 ) 

1199 

1200 # For the general case we have choices for how to proceed. 

1201 # 1. Always use a local file (downloading the remote resource to a 

1202 # temporary file if needed). 

1203 # 2. Use a threshold size and read into memory and use bytes. 

1204 # Use both for now with an arbitrary hand off size. 

1205 # This allows small datasets to be downloaded from remote object 

1206 # stores without requiring a temporary file. 

1207 

1208 formatter = getInfo.formatter 

1209 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1210 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1211 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1212 if cached_file is not None: 

1213 desired_uri = cached_file 

1214 msg = f" (cached version of {uri})" 

1215 else: 

1216 desired_uri = uri 

1217 msg = "" 

1218 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1219 serializedDataset = desired_uri.read() 

1220 log.debug( 

1221 "Deserializing %s from %d bytes from location %s with formatter %s", 

1222 f"component {getInfo.component}" if isComponent else "", 

1223 len(serializedDataset), 

1224 uri, 

1225 formatter.name(), 

1226 ) 

1227 try: 

1228 result = formatter.fromBytes( 

1229 serializedDataset, component=getInfo.component if isComponent else None 

1230 ) 

1231 except Exception as e: 

1232 raise ValueError( 

1233 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1234 f" ({ref.datasetType.name} from {uri}): {e}" 

1235 ) from e 

1236 else: 

1237 # Read from file. 

1238 

1239 # Have to update the Location associated with the formatter 

1240 # because formatter.read does not allow an override. 

1241 # This could be improved. 

1242 location_updated = False 

1243 msg = "" 

1244 

1245 # First check in cache for local version. 

1246 # The cache will only be relevant for remote resources but 

1247 # no harm in always asking. Context manager ensures that cache 

1248 # file is not deleted during cache expiration. 

1249 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1250 if cached_file is not None: 

1251 msg = f"(via cache read of remote file {uri})" 

1252 uri = cached_file 

1253 location_updated = True 

1254 

1255 with uri.as_local() as local_uri: 

1256 

1257 can_be_cached = False 

1258 if uri != local_uri:  1258 ↛ 1260  (line 1258 didn't jump to line 1260, because the condition on line 1258 was never true)

1259 # URI was remote and file was downloaded 

1260 cache_msg = "" 

1261 location_updated = True 

1262 

1263 if self.cacheManager.should_be_cached(cache_ref): 

1264 # In this scenario we want to ask if the downloaded 

1265 # file should be cached but we should not cache 

1266 # it until after we've used it (to ensure it can't 

1267 # be expired whilst we are using it). 

1268 can_be_cached = True 

1269 

1270 # Say that it is "likely" to be cached because 

1271 # if the formatter read fails we will not be 

1272 # caching this file. 

1273 cache_msg = " and likely cached" 

1274 

1275 msg = f"(via download to local file{cache_msg})" 

1276 

1277 # Calculate the (possibly) new location for the formatter 

1278 # to use. 

1279 newLocation = Location(*local_uri.split()) if location_updated else None 

1280 

1281 log.debug( 

1282 "Reading%s from location %s %s with formatter %s", 

1283 f" component {getInfo.component}" if isComponent else "", 

1284 uri, 

1285 msg, 

1286 formatter.name(), 

1287 ) 

1288 try: 

1289 with formatter._updateLocation(newLocation): 

1290 with time_this( 

1291 log, 

1292 msg="Reading%s from location %s %s with formatter %s", 

1293 args=( 

1294 f" component {getInfo.component}" if isComponent else "", 

1295 uri, 

1296 msg, 

1297 formatter.name(), 

1298 ), 

1299 ): 

1300 result = formatter.read(component=getInfo.component if isComponent else None) 

1301 except Exception as e: 

1302 raise ValueError( 

1303 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1304 f" ({ref.datasetType.name} from {uri}): {e}" 

1305 ) from e 

1306 

1307 # File was read successfully so can move to cache 

1308 if can_be_cached:  1308 ↛ 1309  (line 1308 didn't jump to line 1309, because the condition on line 1308 was never true)

1309 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1310 

1311 return self._post_process_get( 

1312 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1313 ) 

1314 
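# Illustrative sketch, not part of this module: the read-path decision made in
# _read_artifact_into_memory above. Small artifacts whose formatter supports
# reading from bytes are pulled directly into memory; everything else is read
# from a (possibly cached or downloaded) local file.
def _use_byte_read(resource_size: int, formatter_can_read_bytes: bool,
                   nbytes_max: int = 10_000_000) -> bool:
    return resource_size <= nbytes_max and formatter_can_read_bytes

assert _use_byte_read(4_096, True) is True  # small file: read bytes, no temporary file
assert _use_byte_read(50_000_000, True) is False  # large file: go through a local file
assert _use_byte_read(4_096, False) is False  # formatter cannot deserialize from bytes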

1315 def knows(self, ref: DatasetRef) -> bool: 

1316 """Check if the dataset is known to the datastore. 

1317 

1318 Does not check for existence of any artifact. 

1319 

1320 Parameters 

1321 ---------- 

1322 ref : `DatasetRef` 

1323 Reference to the required dataset. 

1324 

1325 Returns 

1326 ------- 

1327 exists : `bool` 

1328 `True` if the dataset is known to the datastore. 

1329 """ 

1330 fileLocations = self._get_dataset_locations_info(ref) 

1331 if fileLocations: 

1332 return True 

1333 return False 

1334 

1335 def _process_mexists_records( 

1336 self, 

1337 id_to_ref: Dict[DatasetId, DatasetRef], 

1338 records: Dict[DatasetId, List[StoredFileInfo]], 

1339 all_required: bool, 

1340 artifact_existence: Optional[Dict[ButlerURI, bool]] = None, 

1341 ) -> Dict[DatasetRef, bool]: 

1342 """Helper function for mexists that checks the given records. 

1343 

1344 Parameters 

1345 ---------- 

1346 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1347 Mapping of the dataset ID to the dataset ref itself. 

1348 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1349 Records as generally returned by 

1350 ``_get_stored_records_associated_with_refs``. 

1351 all_required : `bool` 

1352 Flag indicating whether a dataset only counts as existing when all

1353 artifacts associated with its ID exist, rather than any one of them.

1354 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

1355 Mapping of datastore artifact to existence. Updated by this 

1356 method with details of all artifacts tested. Can be `None` 

1357 if the caller is not interested. 

1358 

1359 Returns 

1360 ------- 

1361 existence : `dict` of [`DatasetRef`, `bool`] 

1362 Mapping from dataset to boolean indicating existence. 

1363 """ 

1364 # The URIs to be checked and a mapping of those URIs to 

1365 # the dataset ID. 

1366 uris_to_check: List[ButlerURI] = [] 

1367 location_map: Dict[ButlerURI, DatasetId] = {} 

1368 

1369 location_factory = self.locationFactory 

1370 

1371 for ref_id, infos in records.items():

1372 # Key is the dataset ID, value is a list of StoredFileInfo.

1373 uris = [info.file_location(location_factory).uri for info in infos]

1374 uris_to_check.extend(uris) 

1375 location_map.update({uri: ref_id for uri in uris}) 

1376 

1377 uri_existence: Dict[ButlerURI, bool] = {} 

1378 if artifact_existence is not None: 

1379 # If a URI has already been checked remove it from the list 

1380 # and immediately add the status to the output dict. 

1381 filtered_uris_to_check = [] 

1382 for uri in uris_to_check: 

1383 if uri in artifact_existence: 

1384 uri_existence[uri] = artifact_existence[uri] 

1385 else: 

1386 filtered_uris_to_check.append(uri) 

1387 uris_to_check = filtered_uris_to_check 

1388 

1389 # Results. 

1390 dataset_existence: Dict[DatasetRef, bool] = {} 

1391 

1392 uri_existence.update(ButlerURI.mexists(uris_to_check)) 

1393 for uri, exists in uri_existence.items(): 

1394 dataset_id = location_map[uri] 

1395 ref = id_to_ref[dataset_id] 

1396 

1397 # Disassembled composite needs to check all locations. 

1398 # all_required indicates whether all need to exist or not. 

1399 if ref in dataset_existence: 

1400 if all_required: 

1401 exists = dataset_existence[ref] and exists 

1402 else: 

1403 exists = dataset_existence[ref] or exists 

1404 dataset_existence[ref] = exists 

1405 

1406 if artifact_existence is not None: 

1407 artifact_existence.update(uri_existence) 

1408 

1409 return dataset_existence 

1410 

1411 def mexists( 

1412 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ButlerURI, bool]] = None 

1413 ) -> Dict[DatasetRef, bool]: 

1414 """Check the existence of multiple datasets at once. 

1415 

1416 Parameters 

1417 ---------- 

1418 refs : iterable of `DatasetRef` 

1419 The datasets to be checked. 

1420 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

1421 Mapping of datastore artifact to existence. Updated by this 

1422 method with details of all artifacts tested. Can be `None` 

1423 if the caller is not interested. 

1424 

1425 Returns 

1426 ------- 

1427 existence : `dict` of [`DatasetRef`, `bool`] 

1428 Mapping from dataset to boolean indicating existence. 

1429 """ 

1430 chunk_size = 10_000 

1431 dataset_existence: Dict[DatasetRef, bool] = {} 

1432 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1433 n_found_total = 0 

1434 n_checked = 0 

1435 n_chunks = 0 

1436 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1437 chunk_result = self._mexists(chunk, artifact_existence) 

1438 if log.isEnabledFor(VERBOSE): 

1439 n_results = len(chunk_result) 

1440 n_checked += n_results 

1441 # Can treat the booleans as 0, 1 integers and sum them. 

1442 n_found = sum(chunk_result.values()) 

1443 n_found_total += n_found 

1444 log.verbose( 

1445 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1446 n_chunks, 

1447 n_found, 

1448 n_results, 

1449 n_found_total, 

1450 n_checked, 

1451 ) 

1452 dataset_existence.update(chunk_result) 

1453 n_chunks += 1 

1454 

1455 return dataset_existence 

1456 
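# Illustrative sketch, not part of this module: using mexists() with an external
# artifact-existence cache so that repeated checks do not hit the backing store
# twice. The `datastore` argument is assumed to be a configured FileDatastore and
# `refs` to come from an existing butler repository.
from typing import Dict, Iterable

from lsst.daf.butler import ButlerURI, DatasetRef


def check_twice(datastore, refs: Iterable[DatasetRef]) -> Dict[DatasetRef, bool]:
    refs = list(refs)  # avoid exhausting a one-shot iterable between calls
    artifact_existence: Dict[ButlerURI, bool] = {}
    first = datastore.mexists(refs, artifact_existence=artifact_existence)
    # The second call can answer from artifact_existence for URIs already checked.
    second = datastore.mexists(refs, artifact_existence=artifact_existence)
    assert first == second
    return second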

1457 def _mexists( 

1458 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ButlerURI, bool]] = None 

1459 ) -> Dict[DatasetRef, bool]: 

1460 """Check the existence of multiple datasets at once. 

1461 

1462 Parameters 

1463 ---------- 

1464 refs : iterable of `DatasetRef` 

1465 The datasets to be checked. 

1466 

1467 Returns 

1468 ------- 

1469 existence : `dict` of [`DatasetRef`, `bool`] 

1470 Mapping from dataset to boolean indicating existence. 

1471 """ 

1472 # Need a mapping of dataset_id to dataset ref since the API 

1473 # works with dataset_id 

1474 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1475 

1476 # Set of all IDs we are checking for. 

1477 requested_ids = set(id_to_ref.keys()) 

1478 

1479 # The records themselves. Could be missing some entries. 

1480 records = self._get_stored_records_associated_with_refs(refs) 

1481 

1482 dataset_existence = self._process_mexists_records( 

1483 id_to_ref, records, True, artifact_existence=artifact_existence 

1484 ) 

1485 

1486 # Set of IDs that have been handled. 

1487 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1488 

1489 missing_ids = requested_ids - handled_ids 

1490 if missing_ids: 

1491 if not self.trustGetRequest: 

1492 # Must assume these do not exist 

1493 for missing in missing_ids: 

1494 dataset_existence[id_to_ref[missing]] = False 

1495 else: 

1496 log.debug( 

1497 "%d out of %d datasets were not known to datastore during initial existence check.", 

1498 len(missing_ids), 

1499 len(requested_ids), 

1500 ) 

1501 

1502 # Construct data structure identical to that returned 

1503 # by _get_stored_records_associated_with_refs() but using 

1504 # guessed names. 

1505 records = {} 

1506 for missing in missing_ids: 

1507 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1508 records[missing] = [info for _, info in expected] 

1509 

1510 dataset_existence.update( 

1511 self._process_mexists_records( 

1512 id_to_ref, records, False, artifact_existence=artifact_existence 

1513 ) 

1514 ) 

1515 

1516 return dataset_existence 

1517 

1518 def exists(self, ref: DatasetRef) -> bool: 

1519 """Check if the dataset exists in the datastore. 

1520 

1521 Parameters 

1522 ---------- 

1523 ref : `DatasetRef` 

1524 Reference to the required dataset. 

1525 

1526 Returns 

1527 ------- 

1528 exists : `bool` 

1529 `True` if the entity exists in the `Datastore`. 

1530 """ 

1531 fileLocations = self._get_dataset_locations_info(ref) 

1532 

1533 # If we are being asked to trust that the registry might not be correct, 

1534 # we ask for the expected locations and check them explicitly. 

1535 if not fileLocations: 

1536 if not self.trustGetRequest: 

1537 return False 

1538 

1539 # When we are guessing a dataset location we can not check 

1540 # for the existence of every component since we can not 

1541 # know if every component was written. Instead we check 

1542 # for the existence of any of the expected locations. 

1543 for location, _ in self._get_expected_dataset_locations_info(ref): 1543 ↛ 1546line 1543 didn't jump to line 1546, because the loop on line 1543 didn't complete

1544 if self._artifact_exists(location): 1544 ↛ 1543line 1544 didn't jump to line 1543, because the condition on line 1544 was never false

1545 return True 

1546 return False 

1547 

1548 # All listed artifacts must exist. 

1549 for location, _ in fileLocations: 

1550 if not self._artifact_exists(location): 

1551 return False 

1552 

1553 return True 

1554 

1555 def getURIs( 

1556 self, ref: DatasetRef, predict: bool = False 

1557 ) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1558 """Return URIs associated with dataset. 

1559 

1560 Parameters 

1561 ---------- 

1562 ref : `DatasetRef` 

1563 Reference to the required dataset. 

1564 predict : `bool`, optional 

1565 If the datastore does not know about the dataset, should it 

1566 return a predicted URI or not? 

1567 

1568 Returns 

1569 ------- 

1570 primary : `ButlerURI` 

1571 The URI to the primary artifact associated with this dataset. 

1572 If the dataset was disassembled within the datastore this 

1573 may be `None`. 

1574 components : `dict` 

1575 URIs to any components associated with the dataset artifact. 

1576 Can be empty if there are no components. 

1577 """ 

1578 

1579 primary: Optional[ButlerURI] = None 

1580 components: Dict[str, ButlerURI] = {} 

1581 

1582 # if this has never been written then we have to guess 

1583 if not self.exists(ref): 

1584 if not predict: 

1585 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1586 

1587 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1588 

1589 if doDisassembly: 

1590 

1591 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1592 compRef = ref.makeComponentRef(component) 

1593 compLocation, _ = self._determine_put_formatter_location(compRef) 

1594 

1595 # Add a URI fragment to indicate this is a guess 

1596 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1597 

1598 else: 

1599 

1600 location, _ = self._determine_put_formatter_location(ref) 

1601 

1602 # Add a URI fragment to indicate this is a guess 

1603 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1604 

1605 return primary, components 

1606 

1607 # If this is a ref that we have written we can get the path. 

1608 # Get file metadata and internal metadata 

1609 fileLocations = self._get_dataset_locations_info(ref) 

1610 

1611 guessing = False 

1612 if not fileLocations: 

1613 if not self.trustGetRequest: 1613 ↛ 1614line 1613 didn't jump to line 1614, because the condition on line 1613 was never true

1614 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1615 fileLocations = self._get_expected_dataset_locations_info(ref) 

1616 guessing = True 

1617 

1618 if len(fileLocations) == 1: 

1619 # No disassembly so this is the primary URI 

1620 uri = fileLocations[0][0].uri 

1621 if guessing and not uri.exists(): 1621 ↛ 1622line 1621 didn't jump to line 1622, because the condition on line 1621 was never true

1622 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1623 primary = uri 

1624 

1625 else: 

1626 for location, storedFileInfo in fileLocations: 

1627 if storedFileInfo.component is None: 1627 ↛ 1628line 1627 didn't jump to line 1628, because the condition on line 1627 was never true

1628 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1629 uri = location.uri 

1630 if guessing and not uri.exists(): 1630 ↛ 1634line 1630 didn't jump to line 1634, because the condition on line 1630 was never true

1631 # If we are trusting then it is entirely possible for 

1632 # some components to be missing. In that case we skip 

1633 # to the next component. 

1634 if self.trustGetRequest: 

1635 continue 

1636 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1637 components[storedFileInfo.component] = uri 

1638 

1639 return primary, components 

1640 

1641 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1642 """URI to the Dataset. 

1643 

1644 Parameters 

1645 ---------- 

1646 ref : `DatasetRef` 

1647 Reference to the required Dataset. 

1648 predict : `bool` 

1649 If `True`, allow URIs to be returned of datasets that have not 

1650 been written. 

1651 

1652 Returns 

1653 ------- 

1654 uri : `ButlerURI` 

1655 URI pointing to the dataset within the datastore. If the 

1656 dataset does not exist in the datastore, and if ``predict`` is 

1657 `True`, the URI will be a prediction and will include a URI 

1658 fragment "#predicted". 

1659 If the datastore does not have entities that relate well 

1660 to the concept of a URI the returned URI will be 

1661 descriptive. The returned URI is not guaranteed to be obtainable. 

1662 

1663 Raises 

1664 ------ 

1665 FileNotFoundError 

1666 Raised if a URI has been requested for a dataset that does not 

1667 exist and guessing is not allowed. 

1668 RuntimeError 

1669 Raised if a request is made for a single URI but multiple URIs 

1670 are associated with this dataset. 

1671 

1672 Notes 

1673 ----- 

1674 When a predicted URI is requested an attempt will be made to form 

1675 a reasonable URI based on file templates and the expected formatter. 

1676 """ 

1677 primary, components = self.getURIs(ref, predict) 

1678 if primary is None or components: 1678 ↛ 1679line 1678 didn't jump to line 1679, because the condition on line 1678 was never true

1679 raise RuntimeError( 

1680 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1681 ) 

1682 return primary 

1683 

1684 def retrieveArtifacts( 

1685 self, 

1686 refs: Iterable[DatasetRef], 

1687 destination: ButlerURI, 

1688 transfer: str = "auto", 

1689 preserve_path: bool = True, 

1690 overwrite: bool = False, 

1691 ) -> List[ButlerURI]: 

1692 """Retrieve the file artifacts associated with the supplied refs. 

1693 

1694 Parameters 

1695 ---------- 

1696 refs : iterable of `DatasetRef` 

1697 The datasets for which file artifacts are to be retrieved. 

1698 A single ref can result in multiple files. The refs must 

1699 be resolved. 

1700 destination : `ButlerURI` 

1701 Location to write the file artifacts. 

1702 transfer : `str`, optional 

1703 Method to use to transfer the artifacts. Must be one of the options 

1704 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1705 preserve_path : `bool`, optional 

1706 If `True` the full path of the file artifact within the datastore 

1707 is preserved. If `False` the final file component of the path 

1708 is used. 

1709 overwrite : `bool`, optional 

1710 If `True` allow transfers to overwrite existing files at the 

1711 destination. 

1712 

1713 Returns 

1714 ------- 

1715 targets : `list` of `ButlerURI` 

1716 URIs of file artifacts in destination location. Order is not 

1717 preserved. 

1718 """ 

1719 if not destination.isdir(): 1719 ↛ 1720line 1719 didn't jump to line 1720, because the condition on line 1719 was never true

1720 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1721 

1722 if transfer == "move": 

1723 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1724 

1725 # Source -> Destination 

1726 # This also helps filter out duplicate DatasetRef in the request 

1727 # that will map to the same underlying file transfer. 

1728 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1729 

1730 for ref in refs: 

1731 locations = self._get_dataset_locations_info(ref) 

1732 for location, _ in locations: 

1733 source_uri = location.uri 

1734 target_path: Union[str, ButlerURI] 

1735 if preserve_path: 

1736 target_path = location.pathInStore 

1737 if target_path.isabs(): 1737 ↛ 1740line 1737 didn't jump to line 1740, because the condition on line 1737 was never true

1738 # This is an absolute path to an external file. 

1739 # Use the full path. 

1740 target_path = target_path.relativeToPathRoot 

1741 else: 

1742 target_path = source_uri.basename() 

1743 target_uri = destination.join(target_path) 

1744 to_transfer[source_uri] = target_uri 

1745 

1746 # In theory can now parallelize the transfer 

1747 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1748 for source_uri, target_uri in to_transfer.items(): 

1749 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1750 

1751 return list(to_transfer.values()) 

1752 

1753 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1754 """Load an InMemoryDataset from the store. 

1755 

1756 Parameters 

1757 ---------- 

1758 ref : `DatasetRef` 

1759 Reference to the required Dataset. 

1760 parameters : `dict` 

1761 `StorageClass`-specific parameters that specify, for example, 

1762 a slice of the dataset to be loaded. 

1763 

1764 Returns 

1765 ------- 

1766 inMemoryDataset : `object` 

1767 Requested dataset or slice thereof as an InMemoryDataset. 

1768 

1769 Raises 

1770 ------ 

1771 FileNotFoundError 

1772 Requested dataset can not be retrieved. 

1773 TypeError 

1774 Return value from formatter has unexpected type. 

1775 ValueError 

1776 Formatter failed to process the dataset. 

1777 """ 

1778 allGetInfo = self._prepare_for_get(ref, parameters) 

1779 refComponent = ref.datasetType.component() 

1780 

1781 # Supplied storage class for the component being read 

1782 refStorageClass = ref.datasetType.storageClass 

1783 

1784 # Create mapping from component name to related info 

1785 allComponents = {i.component: i for i in allGetInfo} 

1786 

1787 # By definition the dataset is disassembled if we have more 

1788 # than one record for it. 

1789 isDisassembled = len(allGetInfo) > 1 

1790 

1791 # Look for the special case where we are disassembled but the 

1792 # component is a derived component that was not written during 

1793 # disassembly. For this scenario we need to check that the 

1794 # component requested is listed as a derived component for the 

1795 # composite storage class 

1796 isDisassembledReadOnlyComponent = False 

1797 if isDisassembled and refComponent: 

1798 # The composite storage class should be accessible through 

1799 # the component dataset type 

1800 compositeStorageClass = ref.datasetType.parentStorageClass 

1801 

1802 # In the unlikely scenario where the composite storage 

1803 # class is not known, we can only assume that this is a 

1804 # normal component. If that assumption is wrong then the 

1805 # branch below that reads a persisted component will fail 

1806 # so there is no need to complain here. 

1807 if compositeStorageClass is not None: 1807 ↛ 1810line 1807 didn't jump to line 1810, because the condition on line 1807 was never false

1808 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1809 

1810 if isDisassembled and not refComponent: 

1811 # This was a disassembled dataset spread over multiple files 

1812 # and we need to put them all back together again. 

1813 # Read into memory and then assemble 

1814 

1815 # Check that the supplied parameters are suitable for the type read 

1816 refStorageClass.validateParameters(parameters) 

1817 

1818 # We want to keep track of all the parameters that were not used 

1819 # by formatters. We assume that if any of the component formatters 

1820 # use a parameter that we do not need to apply it again in the 

1821 # assembler. 

1822 usedParams = set() 

1823 

1824 components: Dict[str, Any] = {} 

1825 for getInfo in allGetInfo: 

1826 # assemblerParams are parameters not understood by the 

1827 # associated formatter. 

1828 usedParams.update(set(getInfo.formatterParams)) 

1829 

1830 component = getInfo.component 

1831 

1832 if component is None: 1832 ↛ 1833line 1832 didn't jump to line 1833, because the condition on line 1832 was never true

1833 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1834 

1835 # We do not want the formatter to think it's reading 

1836 # a component though because it is really reading a 

1837 # standalone dataset -- always tell reader it is not a 

1838 # component. 

1839 components[component] = self._read_artifact_into_memory( 

1840 getInfo, ref.makeComponentRef(component), isComponent=False 

1841 ) 

1842 

1843 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1844 

1845 # Any unused parameters will have to be passed to the assembler 

1846 if parameters: 

1847 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1848 else: 

1849 unusedParams = {} 

1850 

1851 # Process parameters 

1852 return ref.datasetType.storageClass.delegate().handleParameters( 

1853 inMemoryDataset, parameters=unusedParams 

1854 ) 

1855 

1856 elif isDisassembledReadOnlyComponent: 

1857 

1858 compositeStorageClass = ref.datasetType.parentStorageClass 

1859 if compositeStorageClass is None: 1859 ↛ 1860line 1859 didn't jump to line 1860, because the condition on line 1859 was never true

1860 raise RuntimeError( 

1861 f"Unable to retrieve derived component '{refComponent}' since" 

1862 "no composite storage class is available." 

1863 ) 

1864 

1865 if refComponent is None: 1865 ↛ 1867line 1865 didn't jump to line 1867, because the condition on line 1865 was never true

1866 # Mainly for mypy 

1867 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1868 

1869 # Assume that every derived component can be calculated by 

1870 # forwarding the request to a single read/write component. 

1871 # Rather than guessing which rw component is the right one by 

1872 # scanning each for a derived component of the same name, 

1873 # we ask the storage class delegate directly which one is best to 

1874 # use. 

1875 compositeDelegate = compositeStorageClass.delegate() 

1876 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1877 refComponent, set(allComponents) 

1878 ) 

1879 

1880 # Select the relevant component 

1881 rwInfo = allComponents[forwardedComponent] 

1882 

1883 # For now assume that read parameters are validated against 

1884 # the real component and not the requested component 

1885 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1886 forwardedStorageClass.validateParameters(parameters) 

1887 

1888 # The reference to use for the caching must refer to the forwarded 

1889 # component and not the derived component. 

1890 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1891 

1892 # Unfortunately the FileDescriptor inside the formatter will have 

1893 # the wrong write storage class so we need to create a new one 

1894 # given the immutability constraint. 

1895 writeStorageClass = rwInfo.info.storageClass 

1896 

1897 # We may need to put some thought into parameters for read 

1898 # components but for now forward them on as is 

1899 readFormatter = type(rwInfo.formatter)( 

1900 FileDescriptor( 

1901 rwInfo.location, 

1902 readStorageClass=refStorageClass, 

1903 storageClass=writeStorageClass, 

1904 parameters=parameters, 

1905 ), 

1906 ref.dataId, 

1907 ) 

1908 

1909 # The assembler can not receive any parameter requests for a 

1910 # derived component at this time since the assembler will 

1911 # see the storage class of the derived component and those 

1912 # parameters will have to be handled by the formatter on the 

1913 # forwarded storage class. 

1914 assemblerParams: Dict[str, Any] = {} 

1915 

1916 # Need to create a new info that specifies the derived 

1917 # component and associated storage class 

1918 readInfo = DatastoreFileGetInformation( 

1919 rwInfo.location, 

1920 readFormatter, 

1921 rwInfo.info, 

1922 assemblerParams, 

1923 {}, 

1924 refComponent, 

1925 refStorageClass, 

1926 ) 

1927 

1928 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

1929 

1930 else: 

1931 # Single file request or component from that composite file 

1932 for lookup in (refComponent, None): 1932 ↛ 1937line 1932 didn't jump to line 1937, because the loop on line 1932 didn't complete

1933 if lookup in allComponents: 1933 ↛ 1932line 1933 didn't jump to line 1932, because the condition on line 1933 was never false

1934 getInfo = allComponents[lookup] 

1935 break 

1936 else: 

1937 raise FileNotFoundError( 

1938 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

1939 ) 

1940 

1941 # Do not need the component itself if already disassembled 

1942 if isDisassembled: 

1943 isComponent = False 

1944 else: 

1945 isComponent = getInfo.component is not None 

1946 

1947 # For a component read of a composite we want the cache to 

1948 # be looking at the composite ref itself. 

1949 cache_ref = ref.makeCompositeRef() if isComponent else ref 

1950 

1951 # For a disassembled component we can validate parameters against 

1952 # the component storage class directly 

1953 if isDisassembled: 

1954 refStorageClass.validateParameters(parameters) 

1955 else: 

1956 # For an assembled composite this could be a derived 

1957 # component derived from a real component. The validity 

1958 # of the parameters is not clear. For now validate against 

1959 # the composite storage class 

1960 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1961 

1962 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

1963 

1964 @transactional 

1965 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1966 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1967 

1968 Parameters 

1969 ---------- 

1970 inMemoryDataset : `object` 

1971 The dataset to store. 

1972 ref : `DatasetRef` 

1973 Reference to the associated Dataset. 

1974 

1975 Raises 

1976 ------ 

1977 TypeError 

1978 Supplied object and storage class are inconsistent. 

1979 DatasetTypeNotSupportedError 

1980 The associated `DatasetType` is not handled by this datastore. 

1981 

1982 Notes 

1983 ----- 

1984 If the datastore is configured to reject certain dataset types it 

1985 is possible that the put will fail and raise a 

1986 `DatasetTypeNotSupportedError`. The main use case for this is to 

1987 allow `ChainedDatastore` to put to multiple datastores without 

1988 requiring that every datastore accepts the dataset. 

1989 """ 

1990 

1991 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1992 # doDisassembly = True 

1993 

1994 artifacts = [] 

1995 if doDisassembly: 

1996 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1997 for component, componentInfo in components.items(): 

1998 # Don't recurse because we want to take advantage of 

1999 # bulk insert -- we need a new DatasetRef that refers to the 

2000 # same dataset_id but has the component DatasetType. 

2001 # DatasetType does not refer to the types of components, 

2002 # so we construct one ourselves. 

2003 compRef = ref.makeComponentRef(component) 

2004 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2005 artifacts.append((compRef, storedInfo)) 

2006 else: 

2007 # Write the entire thing out 

2008 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2009 artifacts.append((ref, storedInfo)) 

2010 

2011 self._register_datasets(artifacts) 

2012 

2013 @transactional 

2014 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2015 # At this point we can safely remove these datasets from the cache 

2016 # to avoid confusion later on. If they are not trashed later 

2017 # the cache will simply be refilled. 

2018 self.cacheManager.remove_from_cache(ref) 

2019 

2020 # If we are in trust mode there will be nothing to move to 

2021 # the trash table and we will have to try to delete the file 

2022 # immediately. 

2023 if self.trustGetRequest: 

2024 # Try to keep the logic below for a single file trash. 

2025 if isinstance(ref, DatasetRef): 

2026 refs = {ref} 

2027 else: 

2028 # Will recreate ref at the end of this branch. 

2029 refs = set(ref) 

2030 

2031 # Determine which datasets are known to datastore directly. 

2032 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2033 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2034 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2035 

2036 missing = refs - existing_refs 

2037 if missing: 

2038 # Do an explicit existence check on these refs. 

2039 # We only care about the artifacts at this point and not 

2040 # the dataset existence. 

2041 artifact_existence: Dict[ButlerURI, bool] = {} 

2042 _ = self.mexists(missing, artifact_existence) 

2043 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2044 

2045 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2046 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2047 for uri in uris: 

2048 try: 

2049 uri.remove() 

2050 except Exception as e: 

2051 if ignore_errors: 

2052 log.debug("Artifact %s could not be removed: %s", uri, e) 

2053 continue 

2054 raise 

2055 

2056 # There is no point asking the code below to remove refs we 

2057 # know are missing so update it with the list of existing 

2058 # records. Try to retain one vs many logic. 

2059 if not existing_refs: 

2060 # Nothing more to do since none of the datasets were 

2061 # known to the datastore record table. 

2062 return 

2063 ref = list(existing_refs) 

2064 if len(ref) == 1: 

2065 ref = ref[0] 

2066 

2067 # Get file metadata and internal metadata 

2068 if not isinstance(ref, DatasetRef): 

2069 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2070 # Assumed to be an iterable of refs so bulk mode enabled. 

2071 try: 

2072 self.bridge.moveToTrash(ref) 

2073 except Exception as e: 

2074 if ignore_errors: 

2075 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2076 else: 

2077 raise 

2078 return 

2079 

2080 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2081 

2082 fileLocations = self._get_dataset_locations_info(ref) 

2083 

2084 if not fileLocations: 

2085 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2086 if ignore_errors: 

2087 log.warning(err_msg) 

2088 return 

2089 else: 

2090 raise FileNotFoundError(err_msg) 

2091 

2092 for location, storedFileInfo in fileLocations: 

2093 if not self._artifact_exists(location): 2093 ↛ 2094line 2093 didn't jump to line 2094

2094 err_msg = ( 

2095 f"Dataset is known to datastore {self.name} but " 

2096 f"associated artifact ({location.uri}) is missing" 

2097 ) 

2098 if ignore_errors: 

2099 log.warning(err_msg) 

2100 return 

2101 else: 

2102 raise FileNotFoundError(err_msg) 

2103 

2104 # Mark dataset as trashed 

2105 try: 

2106 self.bridge.moveToTrash([ref]) 

2107 except Exception as e: 

2108 if ignore_errors: 

2109 log.warning( 

2110 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2111 "but encountered an error: %s", 

2112 ref, 

2113 self.name, 

2114 e, 

2115 ) 

2116 pass 

2117 else: 

2118 raise 

2119 

2120 @transactional 

2121 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2122 """Remove all datasets from the trash. 

2123 

2124 Parameters 

2125 ---------- 

2126 ignore_errors : `bool` 

2127 If `True` return without error even if something went wrong. 

2128 Problems could occur if another process is simultaneously trying 

2129 to delete. 

2130 """ 

2131 log.debug("Emptying trash in datastore %s", self.name) 

2132 

2133 # Context manager will empty trash iff we finish it without raising. 

2134 # It will also automatically delete the relevant rows from the 

2135 # trash table and the records table. 

2136 with self.bridge.emptyTrash( 

2137 self._table, record_class=StoredFileInfo, record_column="path" 

2138 ) as trash_data: 

2139 # Removing the artifacts themselves requires that the files are 

2140 # not also associated with refs that are not to be trashed. 

2141 # Therefore need to do a query with the file paths themselves 

2142 # and return all the refs associated with them. Can only delete 

2143 # a file if the refs to be trashed are the only refs associated 

2144 # with the file. 

2145 # This requires multiple copies of the trashed items 

2146 trashed, artifacts_to_keep = trash_data 

2147 

2148 if artifacts_to_keep is None: 

2149 # The bridge is not helping us so have to work it out 

2150 # ourselves. This is not going to be as efficient. 

2151 trashed = list(trashed) 

2152 

2153 # The instance check is for mypy since up to this point it 

2154 # does not know the type of info. 

2155 path_map = self._refs_associated_with_artifacts( 

2156 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2157 ) 

2158 

2159 for ref, info in trashed: 

2160 

2161 # Mypy needs to know this is not the base class 

2162 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2163 

2164 # Check for mypy 

2165 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2166 

2167 path_map[info.path].remove(ref.id) 

2168 if not path_map[info.path]: 2168 ↛ 2159line 2168 didn't jump to line 2159, because the condition on line 2168 was never false

2169 del path_map[info.path] 

2170 

2171 artifacts_to_keep = set(path_map) 

2172 

2173 for ref, info in trashed: 

2174 

2175 # Should not happen for this implementation but need 

2176 # to keep mypy happy. 

2177 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2178 

2179 # Mypy needs to know this is not the base class 

2180 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2181 

2182 # Check for mypy 

2183 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2184 

2185 if info.path in artifacts_to_keep: 

2186 # This is a multi-dataset artifact and we are not 

2187 # removing all associated refs. 

2188 continue 

2189 

2190 # Only trashed refs still known to datastore will be returned. 

2191 location = info.file_location(self.locationFactory) 

2192 

2193 # Point of no return for this artifact 

2194 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2195 try: 

2196 self._delete_artifact(location) 

2197 except FileNotFoundError: 

2198 # If the file itself has been deleted there is nothing 

2199 # we can do about it. It is possible that trash has 

2200 # been run in parallel in another process or someone 

2201 # decided to delete the file. It is unlikely to come 

2202 # back and so we should still continue with the removal 

2203 # of the entry from the trash table. It is also possible 

2204 # we removed it in a previous iteration if it was 

2205 # a multi-dataset artifact. The delete artifact method 

2206 # will log a debug message in this scenario. 

2207 # Distinguishing a file that was missing before the trash started 

2208 # from one already removed earlier in this trash operation 

2209 # is not worth the potential 

2210 # memory cost. 

2211 pass 

2212 except Exception as e: 

2213 if ignore_errors: 

2214 # Use a debug message here even though it's not 

2215 # a good situation. In some cases this can be 

2216 # caused by a race between user A and user B 

2217 # and neither of them has permissions for the 

2218 # other's files. Butler does not know about users 

2219 # and trash has no idea what collections these 

2220 # files were in (without guessing from a path). 

2221 log.debug( 

2222 "Encountered error removing artifact %s from datastore %s: %s", 

2223 location.uri, 

2224 self.name, 

2225 e, 

2226 ) 

2227 else: 

2228 raise 

2229 

2230 @transactional 

2231 def transfer_from( 

2232 self, 

2233 source_datastore: Datastore, 

2234 refs: Iterable[DatasetRef], 

2235 local_refs: Optional[Iterable[DatasetRef]] = None, 

2236 transfer: str = "auto", 

2237 artifact_existence: Optional[Dict[ButlerURI, bool]] = None, 

2238 ) -> None: 

2239 # Docstring inherited 

2240 if type(self) is not type(source_datastore): 

2241 raise TypeError( 

2242 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2243 f"source datastore ({type(source_datastore)})." 

2244 ) 

2245 

2246 # Be explicit for mypy 

2247 if not isinstance(source_datastore, FileDatastore): 2247 ↛ 2248line 2247 didn't jump to line 2248, because the condition on line 2247 was never true

2248 raise TypeError( 

2249 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2250 f" {type(source_datastore)}" 

2251 ) 

2252 

2253 # Stop early if "direct" transfer mode is requested. That would 

2254 # require that the URI inside the source datastore should be stored 

2255 # directly in the target datastore, which seems unlikely to be useful 

2256 # since at any moment the source datastore could delete the file. 

2257 if transfer in ("direct", "split"): 

2258 raise ValueError( 

2259 f"Can not transfer from a source datastore using {transfer} mode since" 

2260 " those files are controlled by the other datastore." 

2261 ) 

2262 

2263 # Empty existence lookup if none given. 

2264 if artifact_existence is None: 

2265 artifact_existence = {} 

2266 

2267 # We will go through the list multiple times so must convert 

2268 # generators to lists. 

2269 refs = list(refs) 

2270 

2271 if local_refs is None: 

2272 local_refs = refs 

2273 else: 

2274 local_refs = list(local_refs) 

2275 

2276 # In order to handle disassembled composites the code works 

2277 # at the records level since it can assume that internal APIs 

2278 # can be used. 

2279 # - If the record already exists in the destination this is assumed 

2280 # to be okay. 

2281 # - If there is no record but the source and destination URIs are 

2282 # identical no transfer is done but the record is added. 

2283 # - If the source record refers to an absolute URI currently assume 

2284 # that that URI should remain absolute and will be visible to the 

2285 # destination butler. May need to have a flag to indicate whether 

2286 # the dataset should be transferred. This will only happen if 

2287 # the detached Butler has had a local ingest. 

2288 

2289 # What we really want is all the records in the source datastore 

2290 # associated with these refs. Or derived ones if they don't exist 

2291 # in the source. 

2292 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2293 

2294 # The source dataset_ids are the keys in these records 

2295 source_ids = set(source_records) 

2296 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2297 

2298 # The not None check is to appease mypy 

2299 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2300 missing_ids = requested_ids - source_ids 

2301 

2302 # Missing IDs can be okay if that datastore has allowed 

2303 # gets based on file existence. Should we transfer what we can 

2304 # or complain about it and warn? 

2305 if missing_ids and not source_datastore.trustGetRequest: 2305 ↛ 2306line 2305 didn't jump to line 2306, because the condition on line 2305 was never true

2306 raise ValueError( 

2307 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2308 ) 

2309 

2310 # Need to map these missing IDs to a DatasetRef so we can guess 

2311 # the details. 

2312 if missing_ids: 

2313 log.info( 

2314 "Number of expected datasets missing from source datastore records: %d out of %d", 

2315 len(missing_ids), 

2316 len(requested_ids), 

2317 ) 

2318 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2319 

2320 # This should be chunked in case we end up having to check 

2321 # the file store since we need some log output to show 

2322 # progress. 

2323 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2324 records = {} 

2325 for missing in missing_ids_chunk: 

2326 # Ask the source datastore where the missing artifacts 

2327 # should be. An execution butler might not know about the 

2328 # artifacts even if they are there. 

2329 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2330 records[missing] = [info for _, info in expected] 

2331 

2332 # Call the mexists helper method in case we have not already 

2333 # checked these artifacts such that artifact_existence is 

2334 # empty. This allows us to benefit from parallelism. 

2335 # datastore.mexists() itself does not give us access to the 

2336 # derived datastore record. 

2337 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2338 ref_exists = source_datastore._process_mexists_records( 

2339 id_to_ref, records, False, artifact_existence=artifact_existence 

2340 ) 

2341 

2342 # Now go through the records and propagate the ones that exist. 

2343 location_factory = source_datastore.locationFactory 

2344 for missing, record_list in records.items(): 

2345 # Skip completely if the ref does not exist. 

2346 ref = id_to_ref[missing] 

2347 if not ref_exists[ref]: 

2348 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2349 continue 

2350 # Check for file artifact to decide which parts of a 

2351 # disassembled composite do exist. If there is only a 

2352 # single record we don't even need to look because it can't 

2353 # be a composite and must exist. 

2354 if len(record_list) == 1: 

2355 dataset_records = record_list 

2356 else: 

2357 dataset_records = [ 

2358 record 

2359 for record in record_list 

2360 if artifact_existence[record.file_location(location_factory).uri] 

2361 ] 

2362 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2363 

2364 # Rely on source_records being a defaultdict. 

2365 source_records[missing].extend(dataset_records) 

2366 

2367 # See if we already have these records 

2368 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2369 

2370 # The artifacts to register 

2371 artifacts = [] 

2372 

2373 # Refs that already exist 

2374 already_present = [] 

2375 

2376 # Now can transfer the artifacts 

2377 for source_ref, target_ref in zip(refs, local_refs): 

2378 if target_ref.id in target_records: 

2379 # Already have an artifact for this. 

2380 already_present.append(target_ref) 

2381 continue 

2382 

2383 # mypy needs to know these are always resolved refs 

2384 for info in source_records[source_ref.getCheckedId()]: 

2385 source_location = info.file_location(source_datastore.locationFactory) 

2386 target_location = info.file_location(self.locationFactory) 

2387 if source_location == target_location: 2387 ↛ 2391line 2387 didn't jump to line 2391, because the condition on line 2387 was never true

2388 # Either the dataset is already in the target datastore 

2389 # (which is how execution butler currently runs) or 

2390 # it is an absolute URI. 

2391 if source_location.pathInStore.isabs(): 

2392 # Just because we can see the artifact when running 

2393 # the transfer doesn't mean it will be generally 

2394 # accessible to a user of this butler. For now warn 

2395 # but assume it will be accessible. 

2396 log.warning( 

2397 "Transfer request for an outside-datastore artifact has been found at %s", 

2398 source_location, 

2399 ) 

2400 else: 

2401 # Need to transfer it to the new location. 

2402 # Assume we should always overwrite. If the artifact 

2403 # is there this might indicate that a previous transfer 

2404 # was interrupted but was not able to be rolled back 

2405 # completely (eg pre-emption) so follow Datastore default 

2406 # and overwrite. 

2407 target_location.uri.transfer_from( 

2408 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2409 ) 

2410 

2411 artifacts.append((target_ref, info)) 

2412 

2413 self._register_datasets(artifacts) 

2414 

2415 if already_present: 

2416 n_skipped = len(already_present) 

2417 log.info( 

2418 "Skipped transfer of %d dataset%s already present in datastore", 

2419 n_skipped, 

2420 "" if n_skipped == 1 else "s", 

2421 ) 

2422 

2423 @transactional 

2424 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2425 # Docstring inherited. 

2426 refs = list(refs) 

2427 self.bridge.forget(refs) 

2428 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2429 

2430 def validateConfiguration( 

2431 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2432 ) -> None: 

2433 """Validate some of the configuration for this datastore. 

2434 

2435 Parameters 

2436 ---------- 

2437 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2438 Entities to test against this configuration. Can be differing 

2439 types. 

2440 logFailures : `bool`, optional 

2441 If `True`, output a log message for every validation error 

2442 detected. 

2443 

2444 Raises 

2445 ------ 

2446 DatastoreValidationError 

2447 Raised if there is a validation problem with a configuration. 

2448 All the problems are reported in a single exception. 

2449 

2450 Notes 

2451 ----- 

2452 This method checks that all the supplied entities have valid file 

2453 templates and also have formatters defined. 

2454 """ 

2455 

2456 templateFailed = None 

2457 try: 

2458 self.templates.validateTemplates(entities, logFailures=logFailures) 

2459 except FileTemplateValidationError as e: 

2460 templateFailed = str(e) 

2461 

2462 formatterFailed = [] 

2463 for entity in entities: 

2464 try: 

2465 self.formatterFactory.getFormatterClass(entity) 

2466 except KeyError as e: 

2467 formatterFailed.append(str(e)) 

2468 if logFailures: 2468 ↛ 2463line 2468 didn't jump to line 2463, because the condition on line 2468 was never false

2469 log.critical("Formatter failure: %s", e) 

2470 

2471 if templateFailed or formatterFailed: 

2472 messages = [] 

2473 if templateFailed: 2473 ↛ 2474line 2473 didn't jump to line 2474, because the condition on line 2473 was never true

2474 messages.append(templateFailed) 

2475 if formatterFailed: 2475 ↛ 2477line 2475 didn't jump to line 2477, because the condition on line 2475 was never false

2476 messages.append(",".join(formatterFailed)) 

2477 msg = ";\n".join(messages) 

2478 raise DatastoreValidationError(msg) 

2479 

2480 def getLookupKeys(self) -> Set[LookupKey]: 

2481 # Docstring is inherited from base class 

2482 return ( 

2483 self.templates.getLookupKeys() 

2484 | self.formatterFactory.getLookupKeys() 

2485 | self.constraints.getLookupKeys() 

2486 ) 

2487 

2488 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2489 # Docstring is inherited from base class 

2490 # The key can be valid in either formatters or templates so we can 

2491 # only check the template if it exists 

2492 if lookupKey in self.templates: 

2493 try: 

2494 self.templates[lookupKey].validateTemplate(entity) 

2495 except FileTemplateValidationError as e: 

2496 raise DatastoreValidationError(e) from e 

2497 

2498 def export( 

2499 self, 

2500 refs: Iterable[DatasetRef], 

2501 *, 

2502 directory: Optional[Union[ButlerURI, str]] = None, 

2503 transfer: Optional[str] = "auto", 

2504 ) -> Iterable[FileDataset]: 

2505 # Docstring inherited from Datastore.export. 

2506 if transfer is not None and directory is None: 2506 ↛ 2507line 2506 didn't jump to line 2507, because the condition on line 2506 was never true

2507 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2508 

2509 # Force the directory to be a URI object 

2510 directoryUri: Optional[ButlerURI] = None 

2511 if directory is not None: 2511 ↛ 2514line 2511 didn't jump to line 2514, because the condition on line 2511 was never false

2512 directoryUri = ButlerURI(directory, forceDirectory=True) 

2513 

2514 if transfer is not None and directoryUri is not None: 2514 ↛ 2519line 2514 didn't jump to line 2519, because the condition on line 2514 was never false

2515 # mypy needs the second test 

2516 if not directoryUri.exists(): 2516 ↛ 2517line 2516 didn't jump to line 2517, because the condition on line 2516 was never true

2517 raise FileNotFoundError(f"Export location {directory} does not exist") 

2518 

2519 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2520 for ref in progress.wrap(refs, "Exporting dataset files"): 

2521 fileLocations = self._get_dataset_locations_info(ref) 

2522 if not fileLocations: 2522 ↛ 2523line 2522 didn't jump to line 2523, because the condition on line 2522 was never true

2523 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2524 # For now we can not export disassembled datasets 

2525 if len(fileLocations) > 1: 

2526 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2527 location, storedFileInfo = fileLocations[0] 

2528 

2529 pathInStore = location.pathInStore.path 

2530 if transfer is None: 2530 ↛ 2534line 2530 didn't jump to line 2534, because the condition on line 2530 was never true

2531 # TODO: do we also need to return the readStorageClass somehow? 

2532 # We will use the path in store directly. If this is an 

2533 # absolute URI, preserve it. 

2534 if location.pathInStore.isabs(): 

2535 pathInStore = str(location.uri) 

2536 elif transfer == "direct": 2536 ↛ 2538line 2536 didn't jump to line 2538, because the condition on line 2536 was never true

2537 # Use full URIs to the remote store in the export 

2538 pathInStore = str(location.uri) 

2539 else: 

2540 # mypy needs help 

2541 assert directoryUri is not None, "directoryUri must be defined to get here" 

2542 storeUri = ButlerURI(location.uri) 

2543 

2544 # if the datastore has an absolute URI to a resource, we 

2545 # have two options: 

2546 # 1. Keep the absolute URI in the exported YAML 

2547 # 2. Allocate a new name in the local datastore and transfer 

2548 # it. 

2549 # For now go with option 2 

2550 if location.pathInStore.isabs(): 2550 ↛ 2551line 2550 didn't jump to line 2551, because the condition on line 2550 was never true

2551 template = self.templates.getTemplate(ref) 

2552 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2553 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2554 

2555 exportUri = directoryUri.join(pathInStore) 

2556 exportUri.transfer_from(storeUri, transfer=transfer) 

2557 

2558 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2559 

2560 @staticmethod 

2561 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2562 """Compute the checksum of the supplied file. 

2563 

2564 Parameters 

2565 ---------- 

2566 uri : `ButlerURI` 

2567 Name of resource to calculate checksum from. 

2568 algorithm : `str`, optional 

2569 Name of algorithm to use. Must be one of the algorithms supported 

2570 by the :py:mod:`hashlib` module. 

2571 block_size : `int` 

2572 Number of bytes to read from file at one time. 

2573 

2574 Returns 

2575 ------- 

2576 hexdigest : `str` 

2577 Hex digest of the file. 

2578 

2579 Notes 

2580 ----- 

2581 Currently returns None if the URI is for a remote resource. 

2582 """ 

2583 if algorithm not in hashlib.algorithms_guaranteed: 2583 ↛ 2584line 2583 didn't jump to line 2584, because the condition on line 2583 was never true

2584 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2585 

2586 if not uri.isLocal: 2586 ↛ 2587line 2586 didn't jump to line 2587, because the condition on line 2586 was never true

2587 return None 

2588 

2589 hasher = hashlib.new(algorithm) 

2590 

2591 with uri.as_local() as local_uri: 

2592 with open(local_uri.ospath, "rb") as f: 

2593 for chunk in iter(lambda: f.read(block_size), b""): 

2594 hasher.update(chunk) 

2595 

2596 return hasher.hexdigest() 

2597 

2598 def needs_expanded_data_ids( 

2599 self, 

2600 transfer: Optional[str], 

2601 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2602 ) -> bool: 

2603 # Docstring inherited. 

2604 # This _could_ also use entity to inspect whether the filename template 

2605 # involves placeholders other than the required dimensions for its 

2606 # dataset type, but that's not necessary for correctness; it just 

2607 # enables more optimizations (perhaps only in theory). 

2608 return transfer not in ("direct", None)