Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29 

30from sqlalchemy import BigInteger, String 

31 

32from collections import defaultdict 

33from dataclasses import dataclass 

34from typing import ( 

35 TYPE_CHECKING, 

36 Any, 

37 ClassVar, 

38 Dict, 

39 Iterable, 

40 List, 

41 Mapping, 

42 Optional, 

43 Set, 

44 Tuple, 

45 Type, 

46 Union, 

47) 

48 

49from lsst.utils.iteration import chunk_iterable 

50from lsst.utils.introspection import get_class_of, get_instance_of 

51from lsst.utils.timer import time_this 

52 

53# For VERBOSE logging usage. 

54from lsst.utils.logging import getLogger, VERBOSE 

55 

56from lsst.daf.butler import ( 

57 ButlerURI, 

58 CompositesMap, 

59 Config, 

60 FileDataset, 

61 DatasetId, 

62 DatasetRef, 

63 DatasetType, 

64 DatasetTypeNotSupportedError, 

65 Datastore, 

66 DatastoreCacheManager, 

67 DatastoreDisabledCacheManager, 

68 DatastoreConfig, 

69 DatastoreValidationError, 

70 FileDescriptor, 

71 FileTemplates, 

72 FileTemplateValidationError, 

73 Formatter, 

74 FormatterFactory, 

75 Location, 

76 LocationFactory, 

77 Progress, 

78 StorageClass, 

79 StoredFileInfo, 

80) 

81 

82from lsst.daf.butler import ddl 

83from lsst.daf.butler.registry.interfaces import ( 

84 ReadOnlyDatabaseError, 

85 DatastoreRegistryBridge, 

86) 

87 

88from lsst.daf.butler.core.repoRelocation import replaceRoot 

89from lsst.daf.butler.core.utils import transactional 

90from .genericDatastore import GenericBaseDatastore 

91 

92if TYPE_CHECKING: 92 ↛ 93line 92 didn't jump to line 93, because the condition on line 92 was never true

93 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

94 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

95 

96log = getLogger(__name__) 

97 

98 

99class _IngestPrepData(Datastore.IngestPrepData): 

100 """Helper class for FileDatastore ingest implementation. 

101 

102 Parameters 

103 ---------- 

104 datasets : `list` of `FileDataset` 

105 Files to be ingested by this datastore. 

106 """ 

107 def __init__(self, datasets: List[FileDataset]): 

108 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

109 self.datasets = datasets 

110 

111 

112@dataclass(frozen=True) 

113class DatastoreFileGetInformation: 

114 """Collection of useful parameters needed to retrieve a file from 

115 a Datastore. 

116 """ 

117 

118 location: Location 

119 """The location from which to read the dataset.""" 

120 

121 formatter: Formatter 

122 """The `Formatter` to use to deserialize the dataset.""" 

123 

124 info: StoredFileInfo 

125 """Stored information about this file and its formatter.""" 

126 

127 assemblerParams: Dict[str, Any] 

128 """Parameters to use for post-processing the retrieved dataset.""" 

129 

130 formatterParams: Dict[str, Any] 

131 """Parameters that were understood by the associated formatter.""" 

132 

133 component: Optional[str] 

134 """The component to be retrieved (can be `None`).""" 

135 

136 readStorageClass: StorageClass 

137 """The `StorageClass` of the dataset being read.""" 

138 

139 

140class FileDatastore(GenericBaseDatastore): 

141 """Generic Datastore for file-based implementations. 

142 

143 Should always be sub-classed since key abstract methods are missing. 

144 

145 Parameters 

146 ---------- 

147 config : `DatastoreConfig` or `str` 

148 Configuration as either a `Config` object or URI to file. 

149 bridgeManager : `DatastoreRegistryBridgeManager` 

150 Object that manages the interface between `Registry` and datastores. 

151 butlerRoot : `str`, optional 

152 New datastore root to use to override the configuration value. 

153 

154 Raises 

155 ------ 

156 ValueError 

157 If root location does not exist and ``create`` is `False` in the 

158 configuration. 

159 """ 

160 

161 defaultConfigFile: ClassVar[Optional[str]] = None 

162 """Path to configuration defaults. Accessed within the ``config`` resource 

163 or relative to a search path. Can be None if no defaults specified. 

164 """ 

165 

166 root: ButlerURI 

167 """Root directory URI of this `Datastore`.""" 

168 

169 locationFactory: LocationFactory 

170 """Factory for creating locations relative to the datastore root.""" 

171 

172 formatterFactory: FormatterFactory 

173 """Factory for creating instances of formatters.""" 

174 

175 templates: FileTemplates 

176 """File templates that can be used by this `Datastore`.""" 

177 

178 composites: CompositesMap 

179 """Determines whether a dataset should be disassembled on put.""" 

180 

181 defaultConfigFile = "datastores/fileDatastore.yaml" 

182 """Path to configuration defaults. Accessed within the ``config`` resource 

183 or relative to a search path. Can be None if no defaults specified. 

184 """ 

185 

186 @classmethod 

187 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

188 """Set any filesystem-dependent config options for this Datastore to 

189 be appropriate for a new empty repository with the given root. 

190 

191 Parameters 

192 ---------- 

193 root : `str` 

194 URI to the root of the data repository. 

195 config : `Config` 

196 A `Config` to update. Only the subset understood by 

197 this component will be updated. Will not expand 

198 defaults. 

199 full : `Config` 

200 A complete config with all defaults expanded that can be 

201 converted to a `DatastoreConfig`. Read-only and will not be 

202 modified by this method. 

203 Repository-specific options that should not be obtained 

204 from defaults when Butler instances are constructed 

205 should be copied from ``full`` to ``config``. 

206 overwrite : `bool`, optional 

207 If `False`, do not modify a value in ``config`` if the value 

208 already exists. Default is always to overwrite with the provided 

209 ``root``. 

210 

211 Notes 

212 ----- 

213 If a keyword is explicitly defined in the supplied ``config`` it 

214 will not be overridden by this method if ``overwrite`` is `False`. 

215 This allows explicit values set in external configs to be retained. 

216 """ 

217 Config.updateParameters(DatastoreConfig, config, full, 

218 toUpdate={"root": root}, 

219 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

220 

221 @classmethod 

222 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

223 return ddl.TableSpec( 

224 fields=[ 

225 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

226 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

227 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

228 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

229 # Use empty string to indicate no component 

230 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

231 # TODO: should checksum be Base64Bytes instead? 

232 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

233 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

234 ], 

235 unique=frozenset(), 

236 indexes=[tuple(["path"])], 

237 ) 

238 

239 def __init__(self, config: Union[DatastoreConfig, str], 

240 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

241 super().__init__(config, bridgeManager) 

242 if "root" not in self.config: 242 ↛ 243line 242 didn't jump to line 243, because the condition on line 242 was never true

243 raise ValueError("No root directory specified in configuration") 

244 

245 # Name ourselves either using an explicit name or a name 

246 # derived from the (unexpanded) root 

247 if "name" in self.config: 

248 self.name = self.config["name"] 

249 else: 

250 # We use the unexpanded root in the name to indicate that this 

251 # datastore can be moved without having to update registry. 

252 self.name = "{}@{}".format(type(self).__name__, 

253 self.config["root"]) 

254 

255 # Support repository relocation in config 

256 # Existence of self.root is checked in subclass 

257 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

258 forceDirectory=True, forceAbsolute=True) 

259 

260 self.locationFactory = LocationFactory(self.root) 

261 self.formatterFactory = FormatterFactory() 

262 

263 # Now associate formatters with storage classes 

264 self.formatterFactory.registerFormatters(self.config["formatters"], 

265 universe=bridgeManager.universe) 

266 

267 # Read the file naming templates 

268 self.templates = FileTemplates(self.config["templates"], 

269 universe=bridgeManager.universe) 

270 

271 # See if composites should be disassembled 

272 self.composites = CompositesMap(self.config["composites"], 

273 universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

280 # Interface to Registry. 

281 self._bridge = bridgeManager.register(self.name) 

282 except ReadOnlyDatabaseError: 

283 # If the database is read only and we just tried and failed to 

284 # create a table, it means someone is trying to create a read-only 

285 # butler client for an empty repo. That should be okay, as long 

286 # as they then try to get any datasets before some other client 

287 # creates the table. Chances are they'rejust validating 

288 # configuration. 

289 pass 

290 

291 # Determine whether checksums should be used - default to False 

292 self.useChecksum = self.config.get("checksum", False) 

293 

294 # Determine whether we can fall back to configuration if a 

295 # requested dataset is not known to registry 

296 self.trustGetRequest = self.config.get("trust_get_request", False) 

297 

298 # Create a cache manager 

299 self.cacheManager: AbstractDatastoreCacheManager 

300 if "cached" in self.config: 300 ↛ 304line 300 didn't jump to line 304, because the condition on line 300 was never false

301 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

302 universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", 

305 universe=bridgeManager.universe) 

306 

307 # Check existence and create directory structure if necessary 

308 if not self.root.exists(): 

309 if "create" not in self.config or not self.config["create"]: 309 ↛ 310line 309 didn't jump to line 310, because the condition on line 309 was never true

310 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

311 try: 

312 self.root.mkdir() 

313 except Exception as e: 

314 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

315 f" Got error: {e}") from e 

316 

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 def _artifact_exists(self, location: Location) -> bool: 

325 """Check that an artifact exists in this datastore at the specified 

326 location. 

327 

328 Parameters 

329 ---------- 

330 location : `Location` 

331 Expected location of the artifact associated with this datastore. 

332 

333 Returns 

334 ------- 

335 exists : `bool` 

336 True if the location can be found, false otherwise. 

337 """ 

338 log.debug("Checking if resource exists: %s", location.uri) 

339 return location.uri.exists() 

340 

341 def _delete_artifact(self, location: Location) -> None: 

342 """Delete the artifact from the datastore. 

343 

344 Parameters 

345 ---------- 

346 location : `Location` 

347 Location of the artifact associated with this datastore. 

348 """ 

349 if location.pathInStore.isabs(): 349 ↛ 350line 349 didn't jump to line 350, because the condition on line 349 was never true

350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

351 

352 try: 

353 location.uri.remove() 

354 except FileNotFoundError: 

355 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

356 raise 

357 except Exception as e: 

358 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

359 raise 

360 log.debug("Successfully deleted file: %s", location.uri) 

361 

362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

363 # Docstring inherited from GenericBaseDatastore 

364 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

365 self._table.insert(*records) 

366 

367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

368 # Docstring inherited from GenericBaseDatastore 

369 

370 # Look for the dataset_id -- there might be multiple matches 

371 # if we have disassembled the dataset. 

372 records = self._table.fetch(dataset_id=ref.id) 

373 return [StoredFileInfo.from_record(record) for record in records] 

374 

375 def _get_stored_records_associated_with_refs(self, 

376 refs: Iterable[DatasetIdRef] 

377 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

378 """Retrieve all records associated with the provided refs. 

379 

380 Parameters 

381 ---------- 

382 refs : iterable of `DatasetIdRef` 

383 The refs for which records are to be retrieved. 

384 

385 Returns 

386 ------- 

387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

388 The matching records indexed by the ref ID. The number of entries 

389 in the dict can be smaller than the number of requested refs. 

390 """ 

391 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

392 

393 # Uniqueness is dataset_id + component so can have multiple records 

394 # per ref. 

395 records_by_ref = defaultdict(list) 

396 for record in records: 

397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

398 return records_by_ref 

399 

400 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

401 Set[DatasetId]]: 

402 """Return paths and associated dataset refs. 

403 

404 Parameters 

405 ---------- 

406 paths : `list` of `str` or `ButlerURI` 

407 All the paths to include in search. 

408 

409 Returns 

410 ------- 

411 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

412 Mapping of each path to a set of associated database IDs. 

413 """ 

414 records = self._table.fetch(path=[str(path) for path in paths]) 

415 result = defaultdict(set) 

416 for row in records: 

417 result[row["path"]].add(row["dataset_id"]) 

418 return result 

419 

420 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

421 """Return all dataset refs associated with the supplied path. 

422 

423 Parameters 

424 ---------- 

425 pathInStore : `ButlerURI` 

426 Path of interest in the data store. 

427 

428 Returns 

429 ------- 

430 ids : `set` of `int` 

431 All `DatasetRef` IDs associated with this path. 

432 """ 

433 records = list(self._table.fetch(path=str(pathInStore))) 

434 ids = {r["dataset_id"] for r in records} 

435 return ids 

436 

437 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

438 # Docstring inherited from GenericBaseDatastore 

439 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

440 

441 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

442 r"""Find all the `Location`\ s of the requested dataset in the 

443 `Datastore` and the associated stored file information. 

444 

445 Parameters 

446 ---------- 

447 ref : `DatasetRef` 

448 Reference to the required `Dataset`. 

449 

450 Returns 

451 ------- 

452 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

453 Location of the dataset within the datastore and 

454 stored information about each file and its formatter. 

455 """ 

456 # Get the file information (this will fail if no file) 

457 records = self.getStoredItemsInfo(ref) 

458 

459 # Use the path to determine the location -- we need to take 

460 # into account absolute URIs in the datastore record 

461 return [(r.file_location(self.locationFactory), r) for r in records] 

462 

463 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

464 """Check that there is only one dataset associated with the 

465 specified artifact. 

466 

467 Parameters 

468 ---------- 

469 ref : `DatasetRef` or `FakeDatasetRef` 

470 Dataset to be removed. 

471 location : `Location` 

472 The location of the artifact to be removed. 

473 

474 Returns 

475 ------- 

476 can_remove : `Bool` 

477 True if the artifact can be safely removed. 

478 """ 

479 # Can't ever delete absolute URIs. 

480 if location.pathInStore.isabs(): 

481 return False 

482 

483 # Get all entries associated with this path 

484 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

485 if not allRefs: 

486 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

487 

488 # Remove these refs from all the refs and if there is nothing left 

489 # then we can delete 

490 remainingRefs = allRefs - {ref.id} 

491 

492 if remainingRefs: 

493 return False 

494 return True 

495 

496 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

497 StoredFileInfo]]: 

498 """Predict the location and related file information of the requested 

499 dataset in this datastore. 

500 

501 Parameters 

502 ---------- 

503 ref : `DatasetRef` 

504 Reference to the required `Dataset`. 

505 

506 Returns 

507 ------- 

508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

509 Expected Location of the dataset within the datastore and 

510 placeholder information about each file and its formatter. 

511 

512 Notes 

513 ----- 

514 Uses the current configuration to determine how we would expect the 

515 datastore files to have been written if we couldn't ask registry. 

516 This is safe so long as there has been no change to datastore 

517 configuration between writing the dataset and wanting to read it. 

518 Will not work for files that have been ingested without using the 

519 standard file template or default formatter. 

520 """ 

521 

522 # If we have a component ref we always need to ask the questions 

523 # of the composite. If the composite is disassembled this routine 

524 # should return all components. If the composite was not 

525 # disassembled the composite is what is stored regardless of 

526 # component request. Note that if the caller has disassembled 

527 # a composite there is no way for this guess to know that 

528 # without trying both the composite and component ref and seeing 

529 # if there is something at the component Location even without 

530 # disassembly being enabled. 

531 if ref.datasetType.isComponent(): 

532 ref = ref.makeCompositeRef() 

533 

534 # See if the ref is a composite that should be disassembled 

535 doDisassembly = self.composites.shouldBeDisassembled(ref) 

536 

537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

538 

539 if doDisassembly: 

540 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

541 compRef = ref.makeComponentRef(component) 

542 location, formatter = self._determine_put_formatter_location(compRef) 

543 all_info.append((location, formatter, componentStorage, component)) 

544 

545 else: 

546 # Always use the composite ref if no disassembly 

547 location, formatter = self._determine_put_formatter_location(ref) 

548 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

549 

550 # Convert the list of tuples to have StoredFileInfo as second element 

551 return [(location, StoredFileInfo(formatter=formatter, 

552 path=location.pathInStore.path, 

553 storageClass=storageClass, 

554 component=component, 

555 checksum=None, 

556 file_size=-1)) 

557 for location, formatter, storageClass, component in all_info] 

558 

559 def _prepare_for_get(self, ref: DatasetRef, 

560 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

561 """Check parameters for ``get`` and obtain formatter and 

562 location. 

563 

564 Parameters 

565 ---------- 

566 ref : `DatasetRef` 

567 Reference to the required Dataset. 

568 parameters : `dict` 

569 `StorageClass`-specific parameters that specify, for example, 

570 a slice of the dataset to be loaded. 

571 

572 Returns 

573 ------- 

574 getInfo : `list` [`DatastoreFileGetInformation`] 

575 Parameters needed to retrieve each file. 

576 """ 

577 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

578 

579 # Get file metadata and internal metadata 

580 fileLocations = self._get_dataset_locations_info(ref) 

581 if not fileLocations: 

582 if not self.trustGetRequest: 

583 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

584 # Assume the dataset is where we think it should be 

585 fileLocations = self._get_expected_dataset_locations_info(ref) 

586 

587 # The storage class we want to use eventually 

588 refStorageClass = ref.datasetType.storageClass 

589 

590 if len(fileLocations) > 1: 

591 disassembled = True 

592 

593 # If trust is involved it is possible that there will be 

594 # components listed here that do not exist in the datastore. 

595 # Explicitly check for file artifact existence and filter out any 

596 # that are missing. 

597 if self.trustGetRequest: 

598 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

599 

600 # For now complain only if we have no components at all. One 

601 # component is probably a problem but we can punt that to the 

602 # assembler. 

603 if not fileLocations: 603 ↛ 604line 603 didn't jump to line 604, because the condition on line 603 was never true

604 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

605 

606 else: 

607 disassembled = False 

608 

609 # Is this a component request? 

610 refComponent = ref.datasetType.component() 

611 

612 fileGetInfo = [] 

613 for location, storedFileInfo in fileLocations: 

614 

615 # The storage class used to write the file 

616 writeStorageClass = storedFileInfo.storageClass 

617 

618 # If this has been disassembled we need read to match the write 

619 if disassembled: 

620 readStorageClass = writeStorageClass 

621 else: 

622 readStorageClass = refStorageClass 

623 

624 formatter = get_instance_of(storedFileInfo.formatter, 

625 FileDescriptor(location, readStorageClass=readStorageClass, 

626 storageClass=writeStorageClass, parameters=parameters), 

627 ref.dataId) 

628 

629 formatterParams, notFormatterParams = formatter.segregateParameters() 

630 

631 # Of the remaining parameters, extract the ones supported by 

632 # this StorageClass (for components not all will be handled) 

633 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

634 

635 # The ref itself could be a component if the dataset was 

636 # disassembled by butler, or we disassembled in datastore and 

637 # components came from the datastore records 

638 component = storedFileInfo.component if storedFileInfo.component else refComponent 

639 

640 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

641 assemblerParams, formatterParams, 

642 component, readStorageClass)) 

643 

644 return fileGetInfo 

645 

646 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

647 """Check the arguments for ``put`` and obtain formatter and 

648 location. 

649 

650 Parameters 

651 ---------- 

652 inMemoryDataset : `object` 

653 The dataset to store. 

654 ref : `DatasetRef` 

655 Reference to the associated Dataset. 

656 

657 Returns 

658 ------- 

659 location : `Location` 

660 The location to write the dataset. 

661 formatter : `Formatter` 

662 The `Formatter` to use to write the dataset. 

663 

664 Raises 

665 ------ 

666 TypeError 

667 Supplied object and storage class are inconsistent. 

668 DatasetTypeNotSupportedError 

669 The associated `DatasetType` is not handled by this datastore. 

670 """ 

671 self._validate_put_parameters(inMemoryDataset, ref) 

672 return self._determine_put_formatter_location(ref) 

673 

674 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

675 """Calculate the formatter and output location to use for put. 

676 

677 Parameters 

678 ---------- 

679 ref : `DatasetRef` 

680 Reference to the associated Dataset. 

681 

682 Returns 

683 ------- 

684 location : `Location` 

685 The location to write the dataset. 

686 formatter : `Formatter` 

687 The `Formatter` to use to write the dataset. 

688 """ 

689 # Work out output file name 

690 try: 

691 template = self.templates.getTemplate(ref) 

692 except KeyError as e: 

693 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

694 

695 # Validate the template to protect against filenames from different 

696 # dataIds returning the same and causing overwrite confusion. 

697 template.validateTemplate(ref) 

698 

699 location = self.locationFactory.fromPath(template.format(ref)) 

700 

701 # Get the formatter based on the storage class 

702 storageClass = ref.datasetType.storageClass 

703 try: 

704 formatter = self.formatterFactory.getFormatter(ref, 

705 FileDescriptor(location, 

706 storageClass=storageClass), 

707 ref.dataId) 

708 except KeyError as e: 

709 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

710 f"{self.name}") from e 

711 

712 # Now that we know the formatter, update the location 

713 location = formatter.makeUpdatedLocation(location) 

714 

715 return location, formatter 

716 

717 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

718 # Docstring inherited from base class 

719 if transfer != "auto": 

720 return transfer 

721 

722 # See if the paths are within the datastore or not 

723 inside = [self._pathInStore(d.path) is not None for d in datasets] 

724 

725 if all(inside): 

726 transfer = None 

727 elif not any(inside): 727 ↛ 736line 727 didn't jump to line 736, because the condition on line 727 was never false

728 # Allow ButlerURI to use its own knowledge 

729 transfer = "auto" 

730 else: 

731 # This can happen when importing from a datastore that 

732 # has had some datasets ingested using "direct" mode. 

733 # Also allow ButlerURI to sort it out but warn about it. 

734 # This can happen if you are importing from a datastore 

735 # that had some direct transfer datasets. 

736 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' " 

737 "transfer mode. This assumes that the files outside the datastore are " 

738 "still accessible to the new butler since they will not be copied into " 

739 "the target datastore.") 

740 transfer = "split" 

741 

742 return transfer 

743 

744 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

745 """Return path relative to datastore root 

746 

747 Parameters 

748 ---------- 

749 path : `str` or `ButlerURI` 

750 Path to dataset. Can be absolute URI. If relative assumed to 

751 be relative to the datastore. Returns path in datastore 

752 or raises an exception if the path it outside. 

753 

754 Returns 

755 ------- 

756 inStore : `str` 

757 Path relative to datastore root. Returns `None` if the file is 

758 outside the root. 

759 """ 

760 # Relative path will always be relative to datastore 

761 pathUri = ButlerURI(path, forceAbsolute=False) 

762 return pathUri.relative_to(self.root) 

763 

764 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

765 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

766 """Standardize the path of a to-be-ingested file. 

767 

768 Parameters 

769 ---------- 

770 path : `str` or `ButlerURI` 

771 Path of a file to be ingested. 

772 transfer : `str`, optional 

773 How (and whether) the dataset should be added to the datastore. 

774 See `ingest` for details of transfer modes. 

775 This implementation is provided only so 

776 `NotImplementedError` can be raised if the mode is not supported; 

777 actual transfers are deferred to `_extractIngestInfo`. 

778 

779 Returns 

780 ------- 

781 path : `str` or `ButlerURI` 

782 New path in what the datastore considers standard form. If an 

783 absolute URI was given that will be returned unchanged. 

784 

785 Notes 

786 ----- 

787 Subclasses of `FileDatastore` can implement this method instead 

788 of `_prepIngest`. It should not modify the data repository or given 

789 file in any way. 

790 

791 Raises 

792 ------ 

793 NotImplementedError 

794 Raised if the datastore does not support the given transfer mode 

795 (including the case where ingest is not supported at all). 

796 FileNotFoundError 

797 Raised if one of the given files does not exist. 

798 """ 

799 if transfer not in (None, "direct", "split") + self.root.transferModes: 799 ↛ 800line 799 didn't jump to line 800, because the condition on line 799 was never true

800 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

801 

802 # A relative URI indicates relative to datastore root 

803 srcUri = ButlerURI(path, forceAbsolute=False) 

804 if not srcUri.isabs(): 

805 srcUri = self.root.join(path) 

806 

807 if not srcUri.exists(): 

808 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

809 f"are assumed to be relative to {self.root} unless they are absolute.") 

810 

811 if transfer is None: 

812 relpath = srcUri.relative_to(self.root) 

813 if not relpath: 

814 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

815 f"within datastore ({self.root})") 

816 

817 # Return the relative path within the datastore for internal 

818 # transfer 

819 path = relpath 

820 

821 return path 

822 

823 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

824 formatter: Union[Formatter, Type[Formatter]], 

825 transfer: Optional[str] = None) -> StoredFileInfo: 

826 """Relocate (if necessary) and extract `StoredFileInfo` from a 

827 to-be-ingested file. 

828 

829 Parameters 

830 ---------- 

831 path : `str` or `ButlerURI` 

832 URI or path of a file to be ingested. 

833 ref : `DatasetRef` 

834 Reference for the dataset being ingested. Guaranteed to have 

835 ``dataset_id not None`. 

836 formatter : `type` or `Formatter` 

837 `Formatter` subclass to use for this dataset or an instance. 

838 transfer : `str`, optional 

839 How (and whether) the dataset should be added to the datastore. 

840 See `ingest` for details of transfer modes. 

841 

842 Returns 

843 ------- 

844 info : `StoredFileInfo` 

845 Internal datastore record for this file. This will be inserted by 

846 the caller; the `_extractIngestInfo` is only resposible for 

847 creating and populating the struct. 

848 

849 Raises 

850 ------ 

851 FileNotFoundError 

852 Raised if one of the given files does not exist. 

853 FileExistsError 

854 Raised if transfer is not `None` but the (internal) location the 

855 file would be moved to is already occupied. 

856 """ 

857 if self._transaction is None: 857 ↛ 858line 857 didn't jump to line 858, because the condition on line 857 was never true

858 raise RuntimeError("Ingest called without transaction enabled") 

859 

860 # Create URI of the source path, do not need to force a relative 

861 # path to absolute. 

862 srcUri = ButlerURI(path, forceAbsolute=False) 

863 

864 # Track whether we have read the size of the source yet 

865 have_sized = False 

866 

867 tgtLocation: Optional[Location] 

868 if transfer is None or transfer == "split": 

869 # A relative path is assumed to be relative to the datastore 

870 # in this context 

871 if not srcUri.isabs(): 

872 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

873 else: 

874 # Work out the path in the datastore from an absolute URI 

875 # This is required to be within the datastore. 

876 pathInStore = srcUri.relative_to(self.root) 

877 if pathInStore is None and transfer is None: 877 ↛ 878line 877 didn't jump to line 878, because the condition on line 877 was never true

878 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

879 f"not within datastore {self.root}") 

880 if pathInStore: 880 ↛ 882line 880 didn't jump to line 882, because the condition on line 880 was never false

881 tgtLocation = self.locationFactory.fromPath(pathInStore) 

882 elif transfer == "split": 

883 # Outside the datastore but treat that as a direct ingest 

884 # instead. 

885 tgtLocation = None 

886 else: 

887 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for" 

888 f" URI {srcUri}") 

889 elif transfer == "direct": 889 ↛ 894line 889 didn't jump to line 894, because the condition on line 889 was never true

890 # Want to store the full URI to the resource directly in 

891 # datastore. This is useful for referring to permanent archive 

892 # storage for raw data. 

893 # Trust that people know what they are doing. 

894 tgtLocation = None 

895 else: 

896 # Work out the name we want this ingested file to have 

897 # inside the datastore 

898 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

899 if not tgtLocation.uri.dirname().exists(): 

900 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

901 tgtLocation.uri.dirname().mkdir() 

902 

903 # if we are transferring from a local file to a remote location 

904 # it may be more efficient to get the size and checksum of the 

905 # local file rather than the transferred one 

906 if not srcUri.scheme or srcUri.scheme == "file": 906 ↛ 916line 906 didn't jump to line 916, because the condition on line 906 was never false

907 size = srcUri.size() 

908 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

909 have_sized = True 

910 

911 # Transfer the resource to the destination. 

912 # Allow overwrite of an existing file. This matches the behavior 

913 # of datastore.put() in that it trusts that registry would not 

914 # be asking to overwrite unless registry thought that the 

915 # overwrite was allowed. 

916 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction, 

917 overwrite=True) 

918 

919 if tgtLocation is None: 919 ↛ 921line 919 didn't jump to line 921, because the condition on line 919 was never true

920 # This means we are using direct mode 

921 targetUri = srcUri 

922 targetPath = str(srcUri) 

923 else: 

924 targetUri = tgtLocation.uri 

925 targetPath = tgtLocation.pathInStore.path 

926 

927 # the file should exist in the datastore now 

928 if not have_sized: 

929 size = targetUri.size() 

930 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

931 

932 return StoredFileInfo(formatter=formatter, path=targetPath, 

933 storageClass=ref.datasetType.storageClass, 

934 component=ref.datasetType.component(), 

935 file_size=size, checksum=checksum) 

936 

937 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

938 # Docstring inherited from Datastore._prepIngest. 

939 filtered = [] 

940 for dataset in datasets: 

941 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

942 if not acceptable: 

943 continue 

944 else: 

945 dataset.refs = acceptable 

946 if dataset.formatter is None: 

947 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

948 else: 

949 assert isinstance(dataset.formatter, (type, str)) 

950 formatter_class = get_class_of(dataset.formatter) 

951 if not issubclass(formatter_class, Formatter): 951 ↛ 952line 951 didn't jump to line 952, because the condition on line 951 was never true

952 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

953 dataset.formatter = formatter_class 

954 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

955 filtered.append(dataset) 

956 return _IngestPrepData(filtered) 

957 

958 @transactional 

959 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

960 # Docstring inherited from Datastore._finishIngest. 

961 refsAndInfos = [] 

962 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

963 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

964 # Do ingest as if the first dataset ref is associated with the file 

965 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

966 transfer=transfer) 

967 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

968 self._register_datasets(refsAndInfos) 

969 

970 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

971 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

972 """Given a source URI and a DatasetRef, determine the name the 

973 dataset will have inside datastore. 

974 

975 Parameters 

976 ---------- 

977 srcUri : `ButlerURI` 

978 URI to the source dataset file. 

979 ref : `DatasetRef` 

980 Ref associated with the newly-ingested dataset artifact. This 

981 is used to determine the name within the datastore. 

982 formatter : `Formatter` or Formatter class. 

983 Formatter to use for validation. Can be a class or an instance. 

984 

985 Returns 

986 ------- 

987 location : `Location` 

988 Target location for the newly-ingested dataset. 

989 """ 

990 # Ingesting a file from outside the datastore. 

991 # This involves a new name. 

992 template = self.templates.getTemplate(ref) 

993 location = self.locationFactory.fromPath(template.format(ref)) 

994 

995 # Get the extension 

996 ext = srcUri.getExtension() 

997 

998 # Update the destination to include that extension 

999 location.updateExtension(ext) 

1000 

1001 # Ask the formatter to validate this extension 

1002 formatter.validateExtension(location) 

1003 

1004 return location 

1005 

1006 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1007 """Write out in memory dataset to datastore. 

1008 

1009 Parameters 

1010 ---------- 

1011 inMemoryDataset : `object` 

1012 Dataset to write to datastore. 

1013 ref : `DatasetRef` 

1014 Registry information associated with this dataset. 

1015 

1016 Returns 

1017 ------- 

1018 info : `StoredFileInfo` 

1019 Information describin the artifact written to the datastore. 

1020 """ 

1021 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1022 uri = location.uri 

1023 

1024 if not uri.dirname().exists(): 

1025 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1026 uri.dirname().mkdir() 

1027 

1028 if self._transaction is None: 1028 ↛ 1029line 1028 didn't jump to line 1029, because the condition on line 1028 was never true

1029 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1030 

1031 def _removeFileExists(uri: ButlerURI) -> None: 

1032 """Remove a file and do not complain if it is not there. 

1033 

1034 This is important since a formatter might fail before the file 

1035 is written and we should not confuse people by writing spurious 

1036 error messages to the log. 

1037 """ 

1038 try: 

1039 uri.remove() 

1040 except FileNotFoundError: 

1041 pass 

1042 

1043 # Register a callback to try to delete the uploaded data if 

1044 # something fails below 

1045 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1046 

1047 # For a local file, simply use the formatter directly 

1048 if uri.isLocal: 

1049 try: 

1050 formatter.write(inMemoryDataset) 

1051 except Exception as e: 

1052 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

1053 f"to location {uri}") from e 

1054 log.debug("Successfully wrote python object to local file at %s", uri) 

1055 else: 

1056 # This is a remote URI. Some datasets can be serialized directly 

1057 # to bytes and sent to the remote datastore without writing a 

1058 # file. If the dataset is intended to be saved to the cache 

1059 # a file is always written and direct write to the remote 

1060 # datastore is bypassed. 

1061 data_written = False 

1062 if not self.cacheManager.should_be_cached(ref): 

1063 try: 

1064 serializedDataset = formatter.toBytes(inMemoryDataset) 

1065 except NotImplementedError: 

1066 # Fallback to the file writing option. 

1067 pass 

1068 except Exception as e: 

1069 raise RuntimeError(f"Failed to serialize dataset {ref} " 

1070 f"of type {type(inMemoryDataset)} to bytes.") from e 

1071 else: 

1072 log.debug("Writing bytes directly to %s", uri) 

1073 uri.write(serializedDataset, overwrite=True) 

1074 log.debug("Successfully wrote bytes directly to %s", uri) 

1075 data_written = True 

1076 

1077 if not data_written: 

1078 # Did not write the bytes directly to object store so instead 

1079 # write to temporary file. 

1080 with ButlerURI.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1081 # Need to configure the formatter to write to a different 

1082 # location and that needs us to overwrite internals 

1083 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1084 with formatter._updateLocation(Location(None, temporary_uri)): 

1085 try: 

1086 formatter.write(inMemoryDataset) 

1087 except Exception as e: 

1088 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1089 f" {type(inMemoryDataset)} to " 

1090 f"temporary location {temporary_uri}") from e 

1091 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1092 

1093 # Cache if required 

1094 self.cacheManager.move_to_cache(temporary_uri, ref) 

1095 

1096 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1097 

1098 # URI is needed to resolve what ingest case are we dealing with 

1099 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1100 

1101 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1102 ref: DatasetRef, isComponent: bool = False, 

1103 cache_ref: Optional[DatasetRef] = None) -> Any: 

1104 """Read the artifact from datastore into in memory object. 

1105 

1106 Parameters 

1107 ---------- 

1108 getInfo : `DatastoreFileGetInformation` 

1109 Information about the artifact within the datastore. 

1110 ref : `DatasetRef` 

1111 The registry information associated with this artifact. 

1112 isComponent : `bool` 

1113 Flag to indicate if a component is being read from this artifact. 

1114 cache_ref : `DatasetRef`, optional 

1115 The DatasetRef to use when looking up the file in the cache. 

1116 This ref must have the same ID as the supplied ref but can 

1117 be a parent ref or component ref to indicate to the cache whether 

1118 a composite file is being requested from the cache or a component 

1119 file. Without this the cache will default to the supplied ref but 

1120 it can get confused with read-only derived components for 

1121 disassembled composites. 

1122 

1123 Returns 

1124 ------- 

1125 inMemoryDataset : `object` 

1126 The artifact as a python object. 

1127 """ 

1128 location = getInfo.location 

1129 uri = location.uri 

1130 log.debug("Accessing data from %s", uri) 

1131 

1132 if cache_ref is None: 

1133 cache_ref = ref 

1134 if cache_ref.id != ref.id: 1134 ↛ 1135line 1134 didn't jump to line 1135, because the condition on line 1134 was never true

1135 raise ValueError("The supplied cache dataset ref refers to a different dataset than expected:" 

1136 f" {ref.id} != {cache_ref.id}") 

1137 

1138 # Cannot recalculate checksum but can compare size as a quick check 

1139 # Do not do this if the size is negative since that indicates 

1140 # we do not know. 

1141 recorded_size = getInfo.info.file_size 

1142 resource_size = uri.size() 

1143 if recorded_size >= 0 and resource_size != recorded_size: 1143 ↛ 1144line 1143 didn't jump to line 1144, because the condition on line 1143 was never true

1144 raise RuntimeError("Integrity failure in Datastore. " 

1145 f"Size of file {uri} ({resource_size}) " 

1146 f"does not match size recorded in registry of {recorded_size}") 

1147 

1148 # For the general case we have choices for how to proceed. 

1149 # 1. Always use a local file (downloading the remote resource to a 

1150 # temporary file if needed). 

1151 # 2. Use a threshold size and read into memory and use bytes. 

1152 # Use both for now with an arbitrary hand off size. 

1153 # This allows small datasets to be downloaded from remote object 

1154 # stores without requiring a temporary file. 

1155 

1156 formatter = getInfo.formatter 

1157 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1158 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1159 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1160 if cached_file is not None: 

1161 desired_uri = cached_file 

1162 msg = f" (cached version of {uri})" 

1163 else: 

1164 desired_uri = uri 

1165 msg = "" 

1166 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1167 serializedDataset = desired_uri.read() 

1168 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1169 f"component {getInfo.component}" if isComponent else "", 

1170 len(serializedDataset), uri, formatter.name()) 

1171 try: 

1172 result = formatter.fromBytes(serializedDataset, 

1173 component=getInfo.component if isComponent else None) 

1174 except Exception as e: 

1175 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1176 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1177 else: 

1178 # Read from file. 

1179 

1180 # Have to update the Location associated with the formatter 

1181 # because formatter.read does not allow an override. 

1182 # This could be improved. 

1183 location_updated = False 

1184 msg = "" 

1185 

1186 # First check in cache for local version. 

1187 # The cache will only be relevant for remote resources but 

1188 # no harm in always asking. Context manager ensures that cache 

1189 # file is not deleted during cache expiration. 

1190 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1191 if cached_file is not None: 

1192 msg = f"(via cache read of remote file {uri})" 

1193 uri = cached_file 

1194 location_updated = True 

1195 

1196 with uri.as_local() as local_uri: 

1197 

1198 can_be_cached = False 

1199 if uri != local_uri: 1199 ↛ 1201line 1199 didn't jump to line 1201, because the condition on line 1199 was never true

1200 # URI was remote and file was downloaded 

1201 cache_msg = "" 

1202 location_updated = True 

1203 

1204 if self.cacheManager.should_be_cached(cache_ref): 

1205 # In this scenario we want to ask if the downloaded 

1206 # file should be cached but we should not cache 

1207 # it until after we've used it (to ensure it can't 

1208 # be expired whilst we are using it). 

1209 can_be_cached = True 

1210 

1211 # Say that it is "likely" to be cached because 

1212 # if the formatter read fails we will not be 

1213 # caching this file. 

1214 cache_msg = " and likely cached" 

1215 

1216 msg = f"(via download to local file{cache_msg})" 

1217 

1218 # Calculate the (possibly) new location for the formatter 

1219 # to use. 

1220 newLocation = Location(*local_uri.split()) if location_updated else None 

1221 

1222 log.debug("Reading%s from location %s %s with formatter %s", 

1223 f" component {getInfo.component}" if isComponent else "", 

1224 uri, msg, formatter.name()) 

1225 try: 

1226 with formatter._updateLocation(newLocation): 

1227 with time_this(log, msg="Reading%s from location %s %s with formatter %s", 

1228 args=(f" component {getInfo.component}" if isComponent else "", 

1229 uri, msg, formatter.name())): 

1230 result = formatter.read(component=getInfo.component if isComponent else None) 

1231 except Exception as e: 

1232 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1233 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1234 

1235 # File was read successfully so can move to cache 

1236 if can_be_cached: 1236 ↛ 1237line 1236 didn't jump to line 1237, because the condition on line 1236 was never true

1237 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1238 

1239 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1240 isComponent=isComponent) 

1241 

1242 def knows(self, ref: DatasetRef) -> bool: 

1243 """Check if the dataset is known to the datastore. 

1244 

1245 Does not check for existence of any artifact. 

1246 

1247 Parameters 

1248 ---------- 

1249 ref : `DatasetRef` 

1250 Reference to the required dataset. 

1251 

1252 Returns 

1253 ------- 

1254 exists : `bool` 

1255 `True` if the dataset is known to the datastore. 

1256 """ 

1257 fileLocations = self._get_dataset_locations_info(ref) 

1258 if fileLocations: 

1259 return True 

1260 return False 

1261 

1262 def _process_mexists_records(self, id_to_ref: Dict[DatasetId, DatasetRef], 

1263 records: Dict[DatasetId, List[StoredFileInfo]], 

1264 all_required: bool, 

1265 artifact_existence: Optional[Dict[ButlerURI, 

1266 bool]] = None) -> Dict[DatasetRef, bool]: 

1267 """Helper function for mexists that checks the given records. 

1268 

1269 Parameters 

1270 ---------- 

1271 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1272 Mapping of the dataset ID to the dataset ref itself. 

1273 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1274 Records as generally returned by 

1275 ``_get_stored_records_associated_with_refs``. 

1276 all_required : `bool` 

1277 Flag to indicate whether existence requires all artifacts 

1278 associated with a dataset ID to exist or not for existence. 

1279 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

1280 Mapping of datastore artifact to existence. Updated by this 

1281 method with details of all artifacts tested. Can be `None` 

1282 if the caller is not interested. 

1283 

1284 Returns 

1285 ------- 

1286 existence : `dict` of [`DatasetRef`, `bool`] 

1287 Mapping from dataset to boolean indicating existence. 

1288 """ 

1289 # The URIs to be checked and a mapping of those URIs to 

1290 # the dataset ID. 

1291 uris_to_check: List[ButlerURI] = [] 

1292 location_map: Dict[ButlerURI, DatasetId] = {} 

1293 

1294 location_factory = self.locationFactory 

1295 

1296 for ref_id, info in records.items(): 

1297 # Key is the dataId, value is list of StoredItemInfo 

1298 uris = [info.file_location(location_factory).uri for info in info] 

1299 uris_to_check.extend(uris) 

1300 location_map.update({uri: ref_id for uri in uris}) 

1301 

1302 uri_existence: Dict[ButlerURI, bool] = {} 

1303 if artifact_existence is not None: 

1304 # If a URI has already been checked remove it from the list 

1305 # and immediately add the status to the output dict. 

1306 filtered_uris_to_check = [] 

1307 for uri in uris_to_check: 

1308 if uri in artifact_existence: 

1309 uri_existence[uri] = artifact_existence[uri] 

1310 else: 

1311 filtered_uris_to_check.append(uri) 

1312 uris_to_check = filtered_uris_to_check 

1313 

1314 # Results. 

1315 dataset_existence: Dict[DatasetRef, bool] = {} 

1316 

1317 uri_existence.update(ButlerURI.mexists(uris_to_check)) 

1318 for uri, exists in uri_existence.items(): 

1319 dataset_id = location_map[uri] 

1320 ref = id_to_ref[dataset_id] 

1321 

1322 # Disassembled composite needs to check all locations. 

1323 # all_required indicates whether all need to exist or not. 

1324 if ref in dataset_existence: 

1325 if all_required: 

1326 exists = dataset_existence[ref] and exists 

1327 else: 

1328 exists = dataset_existence[ref] or exists 

1329 dataset_existence[ref] = exists 

1330 

1331 if artifact_existence is not None: 

1332 artifact_existence.update(uri_existence) 

1333 

1334 return dataset_existence 

1335 

1336 def mexists(self, refs: Iterable[DatasetRef], 

1337 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]: 

1338 """Check the existence of multiple datasets at once. 

1339 

1340 Parameters 

1341 ---------- 

1342 refs : iterable of `DatasetRef` 

1343 The datasets to be checked. 

1344 artifact_existence : `dict` of [`ButlerURI`, `bool`], optional 

1345 Mapping of datastore artifact to existence. Updated by this 

1346 method with details of all artifacts tested. Can be `None` 

1347 if the caller is not interested. 

1348 

1349 Returns 

1350 ------- 

1351 existence : `dict` of [`DatasetRef`, `bool`] 

1352 Mapping from dataset to boolean indicating existence. 

1353 """ 

1354 chunk_size = 10_000 

1355 dataset_existence: Dict[DatasetRef, bool] = {} 

1356 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", 

1357 chunk_size) 

1358 n_found_total = 0 

1359 n_checked = 0 

1360 n_chunks = 0 

1361 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1362 chunk_result = self._mexists(chunk, artifact_existence) 

1363 if log.isEnabledFor(VERBOSE): 

1364 n_results = len(chunk_result) 

1365 n_checked += n_results 

1366 # Can treat the booleans as 0, 1 integers and sum them. 

1367 n_found = sum(chunk_result.values()) 

1368 n_found_total += n_found 

1369 log.verbose("Number of datasets found in datastore for chunk %d = %d/%d" 

1370 " (running total: %d/%d)", 

1371 n_chunks, n_found, n_results, n_found_total, n_checked) 

1372 dataset_existence.update(chunk_result) 

1373 n_chunks += 1 

1374 

1375 return dataset_existence 

1376 

1377 def _mexists(self, refs: Iterable[DatasetRef], 

1378 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> Dict[DatasetRef, bool]: 

1379 """Check the existence of multiple datasets at once. 

1380 

1381 Parameters 

1382 ---------- 

1383 refs : iterable of `DatasetRef` 

1384 The datasets to be checked. 

1385 

1386 Returns 

1387 ------- 

1388 existence : `dict` of [`DatasetRef`, `bool`] 

1389 Mapping from dataset to boolean indicating existence. 

1390 """ 

1391 # Need a mapping of dataset_id to dataset ref since the API 

1392 # works with dataset_id 

1393 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1394 

1395 # Set of all IDs we are checking for. 

1396 requested_ids = set(id_to_ref.keys()) 

1397 

1398 # The records themselves. Could be missing some entries. 

1399 records = self._get_stored_records_associated_with_refs(refs) 

1400 

1401 dataset_existence = self._process_mexists_records(id_to_ref, records, True, 

1402 artifact_existence=artifact_existence) 

1403 

1404 # Set of IDs that have been handled. 

1405 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1406 

1407 missing_ids = requested_ids - handled_ids 

1408 if missing_ids: 

1409 if not self.trustGetRequest: 

1410 # Must assume these do not exist 

1411 for missing in missing_ids: 

1412 dataset_existence[id_to_ref[missing]] = False 

1413 else: 

1414 log.debug("%d out of %d datasets were not known to datastore during initial existence check.", 

1415 len(missing_ids), len(requested_ids)) 

1416 

1417 # Construct data structure identical to that returned 

1418 # by _get_stored_records_associated_with_refs() but using 

1419 # guessed names. 

1420 records = {} 

1421 for missing in missing_ids: 

1422 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1423 records[missing] = [info for _, info in expected] 

1424 

1425 dataset_existence.update(self._process_mexists_records(id_to_ref, records, False, 

1426 artifact_existence=artifact_existence)) 

1427 

1428 return dataset_existence 

1429 

1430 def exists(self, ref: DatasetRef) -> bool: 

1431 """Check if the dataset exists in the datastore. 

1432 

1433 Parameters 

1434 ---------- 

1435 ref : `DatasetRef` 

1436 Reference to the required dataset. 

1437 

1438 Returns 

1439 ------- 

1440 exists : `bool` 

1441 `True` if the entity exists in the `Datastore`. 

1442 """ 

1443 fileLocations = self._get_dataset_locations_info(ref) 

1444 

1445 # if we are being asked to trust that registry might not be correct 

1446 # we ask for the expected locations and check them explicitly 

1447 if not fileLocations: 

1448 if not self.trustGetRequest: 

1449 return False 

1450 

1451 # When we are guessing a dataset location we can not check 

1452 # for the existence of every component since we can not 

1453 # know if every component was written. Instead we check 

1454 # for the existence of any of the expected locations. 

1455 for location, _ in self._get_expected_dataset_locations_info(ref): 1455 ↛ 1458line 1455 didn't jump to line 1458, because the loop on line 1455 didn't complete

1456 if self._artifact_exists(location): 1456 ↛ 1455line 1456 didn't jump to line 1455, because the condition on line 1456 was never false

1457 return True 

1458 return False 

1459 

1460 # All listed artifacts must exist. 

1461 for location, _ in fileLocations: 

1462 if not self._artifact_exists(location): 

1463 return False 

1464 

1465 return True 

1466 

1467 def getURIs(self, ref: DatasetRef, 

1468 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1469 """Return URIs associated with dataset. 

1470 

1471 Parameters 

1472 ---------- 

1473 ref : `DatasetRef` 

1474 Reference to the required dataset. 

1475 predict : `bool`, optional 

1476 If the datastore does not know about the dataset, should it 

1477 return a predicted URI or not? 

1478 

1479 Returns 

1480 ------- 

1481 primary : `ButlerURI` 

1482 The URI to the primary artifact associated with this dataset. 

1483 If the dataset was disassembled within the datastore this 

1484 may be `None`. 

1485 components : `dict` 

1486 URIs to any components associated with the dataset artifact. 

1487 Can be empty if there are no components. 

1488 """ 

1489 

1490 primary: Optional[ButlerURI] = None 

1491 components: Dict[str, ButlerURI] = {} 

1492 

1493 # if this has never been written then we have to guess 

1494 if not self.exists(ref): 

1495 if not predict: 

1496 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1497 

1498 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1499 

1500 if doDisassembly: 

1501 

1502 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1503 compRef = ref.makeComponentRef(component) 

1504 compLocation, _ = self._determine_put_formatter_location(compRef) 

1505 

1506 # Add a URI fragment to indicate this is a guess 

1507 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1508 

1509 else: 

1510 

1511 location, _ = self._determine_put_formatter_location(ref) 

1512 

1513 # Add a URI fragment to indicate this is a guess 

1514 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1515 

1516 return primary, components 

1517 

1518 # If this is a ref that we have written we can get the path. 

1519 # Get file metadata and internal metadata 

1520 fileLocations = self._get_dataset_locations_info(ref) 

1521 

1522 guessing = False 

1523 if not fileLocations: 

1524 if not self.trustGetRequest: 1524 ↛ 1525line 1524 didn't jump to line 1525, because the condition on line 1524 was never true

1525 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1526 fileLocations = self._get_expected_dataset_locations_info(ref) 

1527 guessing = True 

1528 

1529 if len(fileLocations) == 1: 

1530 # No disassembly so this is the primary URI 

1531 uri = fileLocations[0][0].uri 

1532 if guessing and not uri.exists(): 1532 ↛ 1533line 1532 didn't jump to line 1533, because the condition on line 1532 was never true

1533 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1534 primary = uri 

1535 

1536 else: 

1537 for location, storedFileInfo in fileLocations: 

1538 if storedFileInfo.component is None: 1538 ↛ 1539line 1538 didn't jump to line 1539, because the condition on line 1538 was never true

1539 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1540 uri = location.uri 

1541 if guessing and not uri.exists(): 1541 ↛ 1545line 1541 didn't jump to line 1545, because the condition on line 1541 was never true

1542 # If we are trusting then it is entirely possible for 

1543 # some components to be missing. In that case we skip 

1544 # to the next component. 

1545 if self.trustGetRequest: 

1546 continue 

1547 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1548 components[storedFileInfo.component] = uri 

1549 

1550 return primary, components 

1551 

1552 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1553 """URI to the Dataset. 

1554 

1555 Parameters 

1556 ---------- 

1557 ref : `DatasetRef` 

1558 Reference to the required Dataset. 

1559 predict : `bool` 

1560 If `True`, allow URIs to be returned of datasets that have not 

1561 been written. 

1562 

1563 Returns 

1564 ------- 

1565 uri : `str` 

1566 URI pointing to the dataset within the datastore. If the 

1567 dataset does not exist in the datastore, and if ``predict`` is 

1568 `True`, the URI will be a prediction and will include a URI 

1569 fragment "#predicted". 

1570 If the datastore does not have entities that relate well 

1571 to the concept of a URI the returned URI will be 

1572 descriptive. The returned URI is not guaranteed to be obtainable. 

1573 

1574 Raises 

1575 ------ 

1576 FileNotFoundError 

1577 Raised if a URI has been requested for a dataset that does not 

1578 exist and guessing is not allowed. 

1579 RuntimeError 

1580 Raised if a request is made for a single URI but multiple URIs 

1581 are associated with this dataset. 

1582 

1583 Notes 

1584 ----- 

1585 When a predicted URI is requested an attempt will be made to form 

1586 a reasonable URI based on file templates and the expected formatter. 

1587 """ 

1588 primary, components = self.getURIs(ref, predict) 

1589 if primary is None or components: 1589 ↛ 1590line 1589 didn't jump to line 1590, because the condition on line 1589 was never true

1590 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1591 "Use Dataastore.getURIs() instead.") 

1592 return primary 

1593 

1594 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1595 destination: ButlerURI, transfer: str = "auto", 

1596 preserve_path: bool = True, 

1597 overwrite: bool = False) -> List[ButlerURI]: 

1598 """Retrieve the file artifacts associated with the supplied refs. 

1599 

1600 Parameters 

1601 ---------- 

1602 refs : iterable of `DatasetRef` 

1603 The datasets for which file artifacts are to be retrieved. 

1604 A single ref can result in multiple files. The refs must 

1605 be resolved. 

1606 destination : `ButlerURI` 

1607 Location to write the file artifacts. 

1608 transfer : `str`, optional 

1609 Method to use to transfer the artifacts. Must be one of the options 

1610 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1611 preserve_path : `bool`, optional 

1612 If `True` the full path of the file artifact within the datastore 

1613 is preserved. If `False` the final file component of the path 

1614 is used. 

1615 overwrite : `bool`, optional 

1616 If `True` allow transfers to overwrite existing files at the 

1617 destination. 

1618 

1619 Returns 

1620 ------- 

1621 targets : `list` of `ButlerURI` 

1622 URIs of file artifacts in destination location. Order is not 

1623 preserved. 

1624 """ 

1625 if not destination.isdir(): 1625 ↛ 1626line 1625 didn't jump to line 1626, because the condition on line 1625 was never true

1626 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1627 

1628 if transfer == "move": 

1629 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1630 

1631 # Source -> Destination 

1632 # This also helps filter out duplicate DatasetRef in the request 

1633 # that will map to the same underlying file transfer. 

1634 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1635 

1636 for ref in refs: 

1637 locations = self._get_dataset_locations_info(ref) 

1638 for location, _ in locations: 

1639 source_uri = location.uri 

1640 target_path: Union[str, ButlerURI] 

1641 if preserve_path: 

1642 target_path = location.pathInStore 

1643 if target_path.isabs(): 1643 ↛ 1646line 1643 didn't jump to line 1646, because the condition on line 1643 was never true

1644 # This is an absolute path to an external file. 

1645 # Use the full path. 

1646 target_path = target_path.relativeToPathRoot 

1647 else: 

1648 target_path = source_uri.basename() 

1649 target_uri = destination.join(target_path) 

1650 to_transfer[source_uri] = target_uri 

1651 

1652 # In theory can now parallelize the transfer 

1653 log.debug("Number of artifacts to transfer to %s: %d", 

1654 str(destination), len(to_transfer)) 

1655 for source_uri, target_uri in to_transfer.items(): 

1656 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1657 

1658 return list(to_transfer.values()) 

1659 

1660 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1661 """Load an InMemoryDataset from the store. 

1662 

1663 Parameters 

1664 ---------- 

1665 ref : `DatasetRef` 

1666 Reference to the required Dataset. 

1667 parameters : `dict` 

1668 `StorageClass`-specific parameters that specify, for example, 

1669 a slice of the dataset to be loaded. 

1670 

1671 Returns 

1672 ------- 

1673 inMemoryDataset : `object` 

1674 Requested dataset or slice thereof as an InMemoryDataset. 

1675 

1676 Raises 

1677 ------ 

1678 FileNotFoundError 

1679 Requested dataset can not be retrieved. 

1680 TypeError 

1681 Return value from formatter has unexpected type. 

1682 ValueError 

1683 Formatter failed to process the dataset. 

1684 """ 

1685 allGetInfo = self._prepare_for_get(ref, parameters) 

1686 refComponent = ref.datasetType.component() 

1687 

1688 # Supplied storage class for the component being read 

1689 refStorageClass = ref.datasetType.storageClass 

1690 

1691 # Create mapping from component name to related info 

1692 allComponents = {i.component: i for i in allGetInfo} 

1693 

1694 # By definition the dataset is disassembled if we have more 

1695 # than one record for it. 

1696 isDisassembled = len(allGetInfo) > 1 

1697 

1698 # Look for the special case where we are disassembled but the 

1699 # component is a derived component that was not written during 

1700 # disassembly. For this scenario we need to check that the 

1701 # component requested is listed as a derived component for the 

1702 # composite storage class 

1703 isDisassembledReadOnlyComponent = False 

1704 if isDisassembled and refComponent: 

1705 # The composite storage class should be accessible through 

1706 # the component dataset type 

1707 compositeStorageClass = ref.datasetType.parentStorageClass 

1708 

1709 # In the unlikely scenario where the composite storage 

1710 # class is not known, we can only assume that this is a 

1711 # normal component. If that assumption is wrong then the 

1712 # branch below that reads a persisted component will fail 

1713 # so there is no need to complain here. 

1714 if compositeStorageClass is not None: 1714 ↛ 1717line 1714 didn't jump to line 1717, because the condition on line 1714 was never false

1715 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1716 

1717 if isDisassembled and not refComponent: 

1718 # This was a disassembled dataset spread over multiple files 

1719 # and we need to put them all back together again. 

1720 # Read into memory and then assemble 

1721 

1722 # Check that the supplied parameters are suitable for the type read 

1723 refStorageClass.validateParameters(parameters) 

1724 

1725 # We want to keep track of all the parameters that were not used 

1726 # by formatters. We assume that if any of the component formatters 

1727 # use a parameter that we do not need to apply it again in the 

1728 # assembler. 

1729 usedParams = set() 

1730 

1731 components: Dict[str, Any] = {} 

1732 for getInfo in allGetInfo: 

1733 # assemblerParams are parameters not understood by the 

1734 # associated formatter. 

1735 usedParams.update(set(getInfo.formatterParams)) 

1736 

1737 component = getInfo.component 

1738 

1739 if component is None: 1739 ↛ 1740line 1739 didn't jump to line 1740, because the condition on line 1739 was never true

1740 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1741 

1742 # We do not want the formatter to think it's reading 

1743 # a component though because it is really reading a 

1744 # standalone dataset -- always tell reader it is not a 

1745 # component. 

1746 components[component] = self._read_artifact_into_memory(getInfo, 

1747 ref.makeComponentRef(component), 

1748 isComponent=False) 

1749 

1750 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1751 

1752 # Any unused parameters will have to be passed to the assembler 

1753 if parameters: 

1754 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1755 else: 

1756 unusedParams = {} 

1757 

1758 # Process parameters 

1759 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1760 parameters=unusedParams) 

1761 

1762 elif isDisassembledReadOnlyComponent: 

1763 

1764 compositeStorageClass = ref.datasetType.parentStorageClass 

1765 if compositeStorageClass is None: 1765 ↛ 1766line 1765 didn't jump to line 1766, because the condition on line 1765 was never true

1766 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1767 "no composite storage class is available.") 

1768 

1769 if refComponent is None: 1769 ↛ 1771line 1769 didn't jump to line 1771, because the condition on line 1769 was never true

1770 # Mainly for mypy 

1771 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1772 

1773 # Assume that every derived component can be calculated by 

1774 # forwarding the request to a single read/write component. 

1775 # Rather than guessing which rw component is the right one by 

1776 # scanning each for a derived component of the same name, 

1777 # we ask the storage class delegate directly which one is best to 

1778 # use. 

1779 compositeDelegate = compositeStorageClass.delegate() 

1780 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1781 set(allComponents)) 

1782 

1783 # Select the relevant component 

1784 rwInfo = allComponents[forwardedComponent] 

1785 

1786 # For now assume that read parameters are validated against 

1787 # the real component and not the requested component 

1788 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1789 forwardedStorageClass.validateParameters(parameters) 

1790 

1791 # The reference to use for the caching must refer to the forwarded 

1792 # component and not the derived component. 

1793 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1794 

1795 # Unfortunately the FileDescriptor inside the formatter will have 

1796 # the wrong write storage class so we need to create a new one 

1797 # given the immutability constraint. 

1798 writeStorageClass = rwInfo.info.storageClass 

1799 

1800 # We may need to put some thought into parameters for read 

1801 # components but for now forward them on as is 

1802 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1803 readStorageClass=refStorageClass, 

1804 storageClass=writeStorageClass, 

1805 parameters=parameters), 

1806 ref.dataId) 

1807 

1808 # The assembler can not receive any parameter requests for a 

1809 # derived component at this time since the assembler will 

1810 # see the storage class of the derived component and those 

1811 # parameters will have to be handled by the formatter on the 

1812 # forwarded storage class. 

1813 assemblerParams: Dict[str, Any] = {} 

1814 

1815 # Need to created a new info that specifies the derived 

1816 # component and associated storage class 

1817 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1818 rwInfo.info, assemblerParams, {}, 

1819 refComponent, refStorageClass) 

1820 

1821 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, 

1822 cache_ref=cache_ref) 

1823 

1824 else: 

1825 # Single file request or component from that composite file 

1826 for lookup in (refComponent, None): 1826 ↛ 1831line 1826 didn't jump to line 1831, because the loop on line 1826 didn't complete

1827 if lookup in allComponents: 1827 ↛ 1826line 1827 didn't jump to line 1826, because the condition on line 1827 was never false

1828 getInfo = allComponents[lookup] 

1829 break 

1830 else: 

1831 raise FileNotFoundError(f"Component {refComponent} not found " 

1832 f"for ref {ref} in datastore {self.name}") 

1833 

1834 # Do not need the component itself if already disassembled 

1835 if isDisassembled: 

1836 isComponent = False 

1837 else: 

1838 isComponent = getInfo.component is not None 

1839 

1840 # For a component read of a composite we want the cache to 

1841 # be looking at the composite ref itself. 

1842 cache_ref = ref.makeCompositeRef() if isComponent else ref 

1843 

1844 # For a disassembled component we can validate parametersagainst 

1845 # the component storage class directly 

1846 if isDisassembled: 

1847 refStorageClass.validateParameters(parameters) 

1848 else: 

1849 # For an assembled composite this could be a derived 

1850 # component derived from a real component. The validity 

1851 # of the parameters is not clear. For now validate against 

1852 # the composite storage class 

1853 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1854 

1855 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, 

1856 cache_ref=cache_ref) 

1857 

1858 @transactional 

1859 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1860 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1861 

1862 Parameters 

1863 ---------- 

1864 inMemoryDataset : `object` 

1865 The dataset to store. 

1866 ref : `DatasetRef` 

1867 Reference to the associated Dataset. 

1868 

1869 Raises 

1870 ------ 

1871 TypeError 

1872 Supplied object and storage class are inconsistent. 

1873 DatasetTypeNotSupportedError 

1874 The associated `DatasetType` is not handled by this datastore. 

1875 

1876 Notes 

1877 ----- 

1878 If the datastore is configured to reject certain dataset types it 

1879 is possible that the put will fail and raise a 

1880 `DatasetTypeNotSupportedError`. The main use case for this is to 

1881 allow `ChainedDatastore` to put to multiple datastores without 

1882 requiring that every datastore accepts the dataset. 

1883 """ 

1884 

1885 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1886 # doDisassembly = True 

1887 

1888 artifacts = [] 

1889 if doDisassembly: 

1890 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1891 for component, componentInfo in components.items(): 

1892 # Don't recurse because we want to take advantage of 

1893 # bulk insert -- need a new DatasetRef that refers to the 

1894 # same dataset_id but has the component DatasetType 

1895 # DatasetType does not refer to the types of components 

1896 # So we construct one ourselves. 

1897 compRef = ref.makeComponentRef(component) 

1898 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1899 artifacts.append((compRef, storedInfo)) 

1900 else: 

1901 # Write the entire thing out 

1902 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1903 artifacts.append((ref, storedInfo)) 

1904 

1905 self._register_datasets(artifacts) 

1906 

1907 @transactional 

1908 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1909 # At this point can safely remove these datasets from the cache 

1910 # to avoid confusion later on. If they are not trashed later 

1911 # the cache will simply be refilled. 

1912 self.cacheManager.remove_from_cache(ref) 

1913 

1914 # If we are in trust mode there will be nothing to move to 

1915 # the trash table and we will have to try to delete the file 

1916 # immediately. 

1917 if self.trustGetRequest: 

1918 # Try to keep the logic below for a single file trash. 

1919 if isinstance(ref, DatasetRef): 

1920 refs = {ref} 

1921 else: 

1922 # Will recreate ref at the end of this branch. 

1923 refs = set(ref) 

1924 

1925 # Determine which datasets are known to datastore directly. 

1926 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1927 existing_ids = self._get_stored_records_associated_with_refs(refs) 

1928 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

1929 

1930 missing = refs - existing_refs 

1931 if missing: 

1932 # Do an explicit existence check on these refs. 

1933 # We only care about the artifacts at this point and not 

1934 # the dataset existence. 

1935 artifact_existence: Dict[ButlerURI, bool] = {} 

1936 _ = self.mexists(missing, artifact_existence) 

1937 uris = [uri for uri, exists in artifact_existence.items() if exists] 

1938 

1939 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

1940 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

1941 for uri in uris: 

1942 try: 

1943 uri.remove() 

1944 except Exception as e: 

1945 if ignore_errors: 

1946 log.debug("Artifact %s could not be removed: %s", uri, e) 

1947 continue 

1948 raise 

1949 

1950 # There is no point asking the code below to remove refs we 

1951 # know are missing so update it with the list of existing 

1952 # records. Try to retain one vs many logic. 

1953 if not existing_refs: 

1954 # Nothing more to do since none of the datasets were 

1955 # known to the datastore record table. 

1956 return 

1957 ref = list(existing_refs) 

1958 if len(ref) == 1: 

1959 ref = ref[0] 

1960 

1961 # Get file metadata and internal metadata 

1962 if not isinstance(ref, DatasetRef): 

1963 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1964 # Assumed to be an iterable of refs so bulk mode enabled. 

1965 try: 

1966 self.bridge.moveToTrash(ref) 

1967 except Exception as e: 

1968 if ignore_errors: 

1969 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1970 else: 

1971 raise 

1972 return 

1973 

1974 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1975 

1976 fileLocations = self._get_dataset_locations_info(ref) 

1977 

1978 if not fileLocations: 

1979 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1980 if ignore_errors: 

1981 log.warning(err_msg) 

1982 return 

1983 else: 

1984 raise FileNotFoundError(err_msg) 

1985 

1986 for location, storedFileInfo in fileLocations: 

1987 if not self._artifact_exists(location): 1987 ↛ 1988line 1987 didn't jump to line 1988, because the condition on line 1987 was never true

1988 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1989 f"associated artifact ({location.uri}) is missing" 

1990 if ignore_errors: 

1991 log.warning(err_msg) 

1992 return 

1993 else: 

1994 raise FileNotFoundError(err_msg) 

1995 

1996 # Mark dataset as trashed 

1997 try: 

1998 self.bridge.moveToTrash([ref]) 

1999 except Exception as e: 

2000 if ignore_errors: 

2001 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s " 

2002 "but encountered an error: %s", ref, self.name, e) 

2003 pass 

2004 else: 

2005 raise 

2006 

2007 @transactional 

2008 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2009 """Remove all datasets from the trash. 

2010 

2011 Parameters 

2012 ---------- 

2013 ignore_errors : `bool` 

2014 If `True` return without error even if something went wrong. 

2015 Problems could occur if another process is simultaneously trying 

2016 to delete. 

2017 """ 

2018 log.debug("Emptying trash in datastore %s", self.name) 

2019 

2020 # Context manager will empty trash iff we finish it without raising. 

2021 # It will also automatically delete the relevant rows from the 

2022 # trash table and the records table. 

2023 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

2024 record_column="path") as trash_data: 

2025 # Removing the artifacts themselves requires that the files are 

2026 # not also associated with refs that are not to be trashed. 

2027 # Therefore need to do a query with the file paths themselves 

2028 # and return all the refs associated with them. Can only delete 

2029 # a file if the refs to be trashed are the only refs associated 

2030 # with the file. 

2031 # This requires multiple copies of the trashed items 

2032 trashed, artifacts_to_keep = trash_data 

2033 

2034 if artifacts_to_keep is None: 

2035 # The bridge is not helping us so have to work it out 

2036 # ourselves. This is not going to be as efficient. 

2037 trashed = list(trashed) 

2038 

2039 # The instance check is for mypy since up to this point it 

2040 # does not know the type of info. 

2041 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

2042 if isinstance(info, StoredFileInfo)]) 

2043 

2044 for ref, info in trashed: 

2045 

2046 # Mypy needs to know this is not the base class 

2047 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2048 

2049 # Check for mypy 

2050 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2051 

2052 path_map[info.path].remove(ref.id) 

2053 if not path_map[info.path]: 2053 ↛ 2044line 2053 didn't jump to line 2044, because the condition on line 2053 was never false

2054 del path_map[info.path] 

2055 

2056 artifacts_to_keep = set(path_map) 

2057 

2058 for ref, info in trashed: 

2059 

2060 # Should not happen for this implementation but need 

2061 # to keep mypy happy. 

2062 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2063 

2064 # Mypy needs to know this is not the base class 

2065 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2066 

2067 # Check for mypy 

2068 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2069 

2070 if info.path in artifacts_to_keep: 

2071 # This is a multi-dataset artifact and we are not 

2072 # removing all associated refs. 

2073 continue 

2074 

2075 # Only trashed refs still known to datastore will be returned. 

2076 location = info.file_location(self.locationFactory) 

2077 

2078 # Point of no return for this artifact 

2079 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2080 try: 

2081 self._delete_artifact(location) 

2082 except FileNotFoundError: 

2083 # If the file itself has been deleted there is nothing 

2084 # we can do about it. It is possible that trash has 

2085 # been run in parallel in another process or someone 

2086 # decided to delete the file. It is unlikely to come 

2087 # back and so we should still continue with the removal 

2088 # of the entry from the trash table. It is also possible 

2089 # we removed it in a previous iteration if it was 

2090 # a multi-dataset artifact. The delete artifact method 

2091 # will log a debug message in this scenario. 

2092 # Distinguishing file missing before trash started and 

2093 # file already removed previously as part of this trash 

2094 # is not worth the distinction with regards to potential 

2095 # memory cost. 

2096 pass 

2097 except Exception as e: 

2098 if ignore_errors: 

2099 # Use a debug message here even though it's not 

2100 # a good situation. In some cases this can be 

2101 # caused by a race between user A and user B 

2102 # and neither of them has permissions for the 

2103 # other's files. Butler does not know about users 

2104 # and trash has no idea what collections these 

2105 # files were in (without guessing from a path). 

2106 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

2107 location.uri, self.name, e) 

2108 else: 

2109 raise 

2110 

2111 @transactional 

2112 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef], 

2113 local_refs: Optional[Iterable[DatasetRef]] = None, 

2114 transfer: str = "auto", 

2115 artifact_existence: Optional[Dict[ButlerURI, bool]] = None) -> None: 

2116 # Docstring inherited 

2117 if type(self) is not type(source_datastore): 

2118 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the " 

2119 f"source datastore ({type(source_datastore)}).") 

2120 

2121 # Be explicit for mypy 

2122 if not isinstance(source_datastore, FileDatastore): 2122 ↛ 2123line 2122 didn't jump to line 2123, because the condition on line 2122 was never true

2123 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not" 

2124 f" {type(source_datastore)}") 

2125 

2126 # Stop early if "direct" transfer mode is requested. That would 

2127 # require that the URI inside the source datastore should be stored 

2128 # directly in the target datastore, which seems unlikely to be useful 

2129 # since at any moment the source datastore could delete the file. 

2130 if transfer in ("direct", "split"): 

2131 raise ValueError(f"Can not transfer from a source datastore using {transfer} mode since" 

2132 " those files are controlled by the other datastore.") 

2133 

2134 # Empty existence lookup if none given. 

2135 if artifact_existence is None: 

2136 artifact_existence = {} 

2137 

2138 # We will go through the list multiple times so must convert 

2139 # generators to lists. 

2140 refs = list(refs) 

2141 

2142 if local_refs is None: 

2143 local_refs = refs 

2144 else: 

2145 local_refs = list(local_refs) 

2146 

2147 # In order to handle disassembled composites the code works 

2148 # at the records level since it can assume that internal APIs 

2149 # can be used. 

2150 # - If the record already exists in the destination this is assumed 

2151 # to be okay. 

2152 # - If there is no record but the source and destination URIs are 

2153 # identical no transfer is done but the record is added. 

2154 # - If the source record refers to an absolute URI currently assume 

2155 # that that URI should remain absolute and will be visible to the 

2156 # destination butler. May need to have a flag to indicate whether 

2157 # the dataset should be transferred. This will only happen if 

2158 # the detached Butler has had a local ingest. 

2159 

2160 # What we really want is all the records in the source datastore 

2161 # associated with these refs. Or derived ones if they don't exist 

2162 # in the source. 

2163 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2164 

2165 # The source dataset_ids are the keys in these records 

2166 source_ids = set(source_records) 

2167 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2168 

2169 # The not None check is to appease mypy 

2170 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2171 missing_ids = requested_ids - source_ids 

2172 

2173 # Missing IDs can be okay if that datastore has allowed 

2174 # gets based on file existence. Should we transfer what we can 

2175 # or complain about it and warn? 

2176 if missing_ids and not source_datastore.trustGetRequest: 2176 ↛ 2177line 2176 didn't jump to line 2177, because the condition on line 2176 was never true

2177 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:" 

2178 f" {missing_ids}") 

2179 

2180 # Need to map these missing IDs to a DatasetRef so we can guess 

2181 # the details. 

2182 if missing_ids: 

2183 log.info("Number of expected datasets missing from source datastore records: %d out of %d", 

2184 len(missing_ids), len(requested_ids)) 

2185 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2186 

2187 # This should be chunked in case we end up having to check 

2188 # the file store since we need some log output to show 

2189 # progress. 

2190 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2191 records = {} 

2192 for missing in missing_ids_chunk: 

2193 # Ask the source datastore where the missing artifacts 

2194 # should be. An execution butler might not know about the 

2195 # artifacts even if they are there. 

2196 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2197 records[missing] = [info for _, info in expected] 

2198 

2199 # Call the mexist helper method in case we have not already 

2200 # checked these artifacts such that artifact_existence is 

2201 # empty. This allows us to benefit from parallelism. 

2202 # datastore.mexists() itself does not give us access to the 

2203 # derived datastore record. 

2204 log.verbose("Checking existence of %d datasets unknown to datastore", 

2205 len(records)) 

2206 ref_exists = source_datastore._process_mexists_records(id_to_ref, records, False, 

2207 artifact_existence=artifact_existence) 

2208 

2209 # Now go through the records and propagate the ones that exist. 

2210 location_factory = source_datastore.locationFactory 

2211 for missing, record_list in records.items(): 

2212 # Skip completely if the ref does not exist. 

2213 ref = id_to_ref[missing] 

2214 if not ref_exists[ref]: 

2215 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", 

2216 ref) 

2217 continue 

2218 # Check for file artifact to decide which parts of a 

2219 # disassembled composite do exist. If there is only a 

2220 # single record we don't even need to look because it can't 

2221 # be a composite and must exist. 

2222 if len(record_list) == 1: 

2223 dataset_records = record_list 

2224 else: 

2225 dataset_records = [record for record in record_list 

2226 if artifact_existence[record.file_location(location_factory).uri]] 

2227 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2228 

2229 # Rely on source_records being a defaultdict. 

2230 source_records[missing].extend(dataset_records) 

2231 

2232 # See if we already have these records 

2233 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2234 

2235 # The artifacts to register 

2236 artifacts = [] 

2237 

2238 # Refs that already exist 

2239 already_present = [] 

2240 

2241 # Now can transfer the artifacts 

2242 for source_ref, target_ref in zip(refs, local_refs): 

2243 if target_ref.id in target_records: 

2244 # Already have an artifact for this. 

2245 already_present.append(target_ref) 

2246 continue 

2247 

2248 # mypy needs to know these are always resolved refs 

2249 for info in source_records[source_ref.getCheckedId()]: 

2250 source_location = info.file_location(source_datastore.locationFactory) 

2251 target_location = info.file_location(self.locationFactory) 

2252 if source_location == target_location: 2252 ↛ 2256line 2252 didn't jump to line 2256, because the condition on line 2252 was never true

2253 # Either the dataset is already in the target datastore 

2254 # (which is how execution butler currently runs) or 

2255 # it is an absolute URI. 

2256 if source_location.pathInStore.isabs(): 

2257 # Just because we can see the artifact when running 

2258 # the transfer doesn't mean it will be generally 

2259 # accessible to a user of this butler. For now warn 

2260 # but assume it will be accessible. 

2261 log.warning("Transfer request for an outside-datastore artifact has been found at %s", 

2262 source_location) 

2263 else: 

2264 # Need to transfer it to the new location. 

2265 # Assume we should always overwrite. If the artifact 

2266 # is there this might indicate that a previous transfer 

2267 # was interrupted but was not able to be rolled back 

2268 # completely (eg pre-emption) so follow Datastore default 

2269 # and overwrite. 

2270 target_location.uri.transfer_from(source_location.uri, transfer=transfer, 

2271 overwrite=True, transaction=self._transaction) 

2272 

2273 artifacts.append((target_ref, info)) 

2274 

2275 self._register_datasets(artifacts) 

2276 

2277 if already_present: 

2278 n_skipped = len(already_present) 

2279 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped, 

2280 "" if n_skipped == 1 else "s") 

2281 

2282 @transactional 

2283 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2284 # Docstring inherited. 

2285 refs = list(refs) 

2286 self.bridge.forget(refs) 

2287 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2288 

2289 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

2290 logFailures: bool = False) -> None: 

2291 """Validate some of the configuration for this datastore. 

2292 

2293 Parameters 

2294 ---------- 

2295 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2296 Entities to test against this configuration. Can be differing 

2297 types. 

2298 logFailures : `bool`, optional 

2299 If `True`, output a log message for every validation error 

2300 detected. 

2301 

2302 Raises 

2303 ------ 

2304 DatastoreValidationError 

2305 Raised if there is a validation problem with a configuration. 

2306 All the problems are reported in a single exception. 

2307 

2308 Notes 

2309 ----- 

2310 This method checks that all the supplied entities have valid file 

2311 templates and also have formatters defined. 

2312 """ 

2313 

2314 templateFailed = None 

2315 try: 

2316 self.templates.validateTemplates(entities, logFailures=logFailures) 

2317 except FileTemplateValidationError as e: 

2318 templateFailed = str(e) 

2319 

2320 formatterFailed = [] 

2321 for entity in entities: 

2322 try: 

2323 self.formatterFactory.getFormatterClass(entity) 

2324 except KeyError as e: 

2325 formatterFailed.append(str(e)) 

2326 if logFailures: 2326 ↛ 2321line 2326 didn't jump to line 2321, because the condition on line 2326 was never false

2327 log.critical("Formatter failure: %s", e) 

2328 

2329 if templateFailed or formatterFailed: 

2330 messages = [] 

2331 if templateFailed: 2331 ↛ 2332line 2331 didn't jump to line 2332, because the condition on line 2331 was never true

2332 messages.append(templateFailed) 

2333 if formatterFailed: 2333 ↛ 2335line 2333 didn't jump to line 2335, because the condition on line 2333 was never false

2334 messages.append(",".join(formatterFailed)) 

2335 msg = ";\n".join(messages) 

2336 raise DatastoreValidationError(msg) 

2337 

2338 def getLookupKeys(self) -> Set[LookupKey]: 

2339 # Docstring is inherited from base class 

2340 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

2341 self.constraints.getLookupKeys() 

2342 

2343 def validateKey(self, lookupKey: LookupKey, 

2344 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2345 # Docstring is inherited from base class 

2346 # The key can be valid in either formatters or templates so we can 

2347 # only check the template if it exists 

2348 if lookupKey in self.templates: 

2349 try: 

2350 self.templates[lookupKey].validateTemplate(entity) 

2351 except FileTemplateValidationError as e: 

2352 raise DatastoreValidationError(e) from e 

2353 

2354 def export(self, refs: Iterable[DatasetRef], *, 

2355 directory: Optional[Union[ButlerURI, str]] = None, 

2356 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

2357 # Docstring inherited from Datastore.export. 

2358 if transfer is not None and directory is None: 2358 ↛ 2359line 2358 didn't jump to line 2359, because the condition on line 2358 was never true

2359 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

2360 "export directory given") 

2361 

2362 # Force the directory to be a URI object 

2363 directoryUri: Optional[ButlerURI] = None 

2364 if directory is not None: 2364 ↛ 2367line 2364 didn't jump to line 2367, because the condition on line 2364 was never false

2365 directoryUri = ButlerURI(directory, forceDirectory=True) 

2366 

2367 if transfer is not None and directoryUri is not None: 2367 ↛ 2372line 2367 didn't jump to line 2372, because the condition on line 2367 was never false

2368 # mypy needs the second test 

2369 if not directoryUri.exists(): 2369 ↛ 2370line 2369 didn't jump to line 2370, because the condition on line 2369 was never true

2370 raise FileNotFoundError(f"Export location {directory} does not exist") 

2371 

2372 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2373 for ref in progress.wrap(refs, "Exporting dataset files"): 

2374 fileLocations = self._get_dataset_locations_info(ref) 

2375 if not fileLocations: 2375 ↛ 2376line 2375 didn't jump to line 2376, because the condition on line 2375 was never true

2376 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2377 # For now we can not export disassembled datasets 

2378 if len(fileLocations) > 1: 

2379 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2380 location, storedFileInfo = fileLocations[0] 

2381 

2382 pathInStore = location.pathInStore.path 

2383 if transfer is None: 2383 ↛ 2387line 2383 didn't jump to line 2387, because the condition on line 2383 was never true

2384 # TODO: do we also need to return the readStorageClass somehow? 

2385 # We will use the path in store directly. If this is an 

2386 # absolute URI, preserve it. 

2387 if location.pathInStore.isabs(): 

2388 pathInStore = str(location.uri) 

2389 elif transfer == "direct": 2389 ↛ 2391line 2389 didn't jump to line 2391, because the condition on line 2389 was never true

2390 # Use full URIs to the remote store in the export 

2391 pathInStore = str(location.uri) 

2392 else: 

2393 # mypy needs help 

2394 assert directoryUri is not None, "directoryUri must be defined to get here" 

2395 storeUri = ButlerURI(location.uri) 

2396 

2397 # if the datastore has an absolute URI to a resource, we 

2398 # have two options: 

2399 # 1. Keep the absolute URI in the exported YAML 

2400 # 2. Allocate a new name in the local datastore and transfer 

2401 # it. 

2402 # For now go with option 2 

2403 if location.pathInStore.isabs(): 2403 ↛ 2404line 2403 didn't jump to line 2404, because the condition on line 2403 was never true

2404 template = self.templates.getTemplate(ref) 

2405 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2406 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2407 

2408 exportUri = directoryUri.join(pathInStore) 

2409 exportUri.transfer_from(storeUri, transfer=transfer) 

2410 

2411 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2412 

2413 @staticmethod 

2414 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2415 """Compute the checksum of the supplied file. 

2416 

2417 Parameters 

2418 ---------- 

2419 uri : `ButlerURI` 

2420 Name of resource to calculate checksum from. 

2421 algorithm : `str`, optional 

2422 Name of algorithm to use. Must be one of the algorithms supported 

2423 by :py:class`hashlib`. 

2424 block_size : `int` 

2425 Number of bytes to read from file at one time. 

2426 

2427 Returns 

2428 ------- 

2429 hexdigest : `str` 

2430 Hex digest of the file. 

2431 

2432 Notes 

2433 ----- 

2434 Currently returns None if the URI is for a remote resource. 

2435 """ 

2436 if algorithm not in hashlib.algorithms_guaranteed: 2436 ↛ 2437line 2436 didn't jump to line 2437, because the condition on line 2436 was never true

2437 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2438 

2439 if not uri.isLocal: 2439 ↛ 2440line 2439 didn't jump to line 2440, because the condition on line 2439 was never true

2440 return None 

2441 

2442 hasher = hashlib.new(algorithm) 

2443 

2444 with uri.as_local() as local_uri: 

2445 with open(local_uri.ospath, "rb") as f: 

2446 for chunk in iter(lambda: f.read(block_size), b""): 

2447 hasher.update(chunk) 

2448 

2449 return hasher.hexdigest() 

2450 

2451 def needs_expanded_data_ids( 

2452 self, 

2453 transfer: Optional[str], 

2454 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2455 ) -> bool: 

2456 # Docstring inherited. 

2457 # This _could_ also use entity to inspect whether the filename template 

2458 # involves placeholders other than the required dimensions for its 

2459 # dataset type, but that's not necessary for correctness; it just 

2460 # enables more optimizations (perhaps only in theory). 

2461 return transfer not in ("direct", None)