Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from collections import defaultdict 

35from dataclasses import dataclass 

36from typing import ( 

37 TYPE_CHECKING, 

38 Any, 

39 ClassVar, 

40 Dict, 

41 Iterable, 

42 List, 

43 Mapping, 

44 Optional, 

45 Set, 

46 Tuple, 

47 Type, 

48 Union, 

49) 

50 

51from lsst.daf.butler import ( 

52 ButlerURI, 

53 CompositesMap, 

54 Config, 

55 FileDataset, 

56 DatasetId, 

57 DatasetRef, 

58 DatasetType, 

59 DatasetTypeNotSupportedError, 

60 Datastore, 

61 DatastoreCacheManager, 

62 DatastoreDisabledCacheManager, 

63 DatastoreConfig, 

64 DatastoreValidationError, 

65 FileDescriptor, 

66 FileTemplates, 

67 FileTemplateValidationError, 

68 Formatter, 

69 FormatterFactory, 

70 Location, 

71 LocationFactory, 

72 Progress, 

73 StorageClass, 

74 StoredFileInfo, 

75) 

76 

77from lsst.daf.butler import ddl 

78from lsst.daf.butler.registry.interfaces import ( 

79 ReadOnlyDatabaseError, 

80 DatastoreRegistryBridge, 

81) 

82 

83from lsst.daf.butler.core.repoRelocation import replaceRoot 

84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional, time_this 

85from .genericDatastore import GenericBaseDatastore 

86 

87if TYPE_CHECKING: 87 ↛ 88line 87 didn't jump to line 88, because the condition on line 87 was never true

88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

90 

91log = logging.getLogger(__name__) 

92 

93 

94class _IngestPrepData(Datastore.IngestPrepData): 

95 """Helper class for FileDatastore ingest implementation. 

96 

97 Parameters 

98 ---------- 

99 datasets : `list` of `FileDataset` 

100 Files to be ingested by this datastore. 

101 """ 

102 def __init__(self, datasets: List[FileDataset]): 

103 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

104 self.datasets = datasets 

105 

106 

107@dataclass(frozen=True) 

108class DatastoreFileGetInformation: 

109 """Collection of useful parameters needed to retrieve a file from 

110 a Datastore. 

111 """ 

112 

113 location: Location 

114 """The location from which to read the dataset.""" 

115 

116 formatter: Formatter 

117 """The `Formatter` to use to deserialize the dataset.""" 

118 

119 info: StoredFileInfo 

120 """Stored information about this file and its formatter.""" 

121 

122 assemblerParams: Dict[str, Any] 

123 """Parameters to use for post-processing the retrieved dataset.""" 

124 

125 formatterParams: Dict[str, Any] 

126 """Parameters that were understood by the associated formatter.""" 

127 

128 component: Optional[str] 

129 """The component to be retrieved (can be `None`).""" 

130 

131 readStorageClass: StorageClass 

132 """The `StorageClass` of the dataset being read.""" 

133 

134 

135class FileDatastore(GenericBaseDatastore): 

136 """Generic Datastore for file-based implementations. 

137 

138 Should always be sub-classed since key abstract methods are missing. 

139 

140 Parameters 

141 ---------- 

142 config : `DatastoreConfig` or `str` 

143 Configuration as either a `Config` object or URI to file. 

144 bridgeManager : `DatastoreRegistryBridgeManager` 

145 Object that manages the interface between `Registry` and datastores. 

146 butlerRoot : `str`, optional 

147 New datastore root to use to override the configuration value. 

148 

149 Raises 

150 ------ 

151 ValueError 

152 If root location does not exist and ``create`` is `False` in the 

153 configuration. 

154 """ 

155 

156 defaultConfigFile: ClassVar[Optional[str]] = None 

157 """Path to configuration defaults. Accessed within the ``config`` resource 

158 or relative to a search path. Can be None if no defaults specified. 

159 """ 

160 

161 root: ButlerURI 

162 """Root directory URI of this `Datastore`.""" 

163 

164 locationFactory: LocationFactory 

165 """Factory for creating locations relative to the datastore root.""" 

166 

167 formatterFactory: FormatterFactory 

168 """Factory for creating instances of formatters.""" 

169 

170 templates: FileTemplates 

171 """File templates that can be used by this `Datastore`.""" 

172 

173 composites: CompositesMap 

174 """Determines whether a dataset should be disassembled on put.""" 

175 

176 defaultConfigFile = "datastores/fileDatastore.yaml" 

177 """Path to configuration defaults. Accessed within the ``config`` resource 

178 or relative to a search path. Can be None if no defaults specified. 

179 """ 

180 

181 @classmethod 

182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

183 """Set any filesystem-dependent config options for this Datastore to 

184 be appropriate for a new empty repository with the given root. 

185 

186 Parameters 

187 ---------- 

188 root : `str` 

189 URI to the root of the data repository. 

190 config : `Config` 

191 A `Config` to update. Only the subset understood by 

192 this component will be updated. Will not expand 

193 defaults. 

194 full : `Config` 

195 A complete config with all defaults expanded that can be 

196 converted to a `DatastoreConfig`. Read-only and will not be 

197 modified by this method. 

198 Repository-specific options that should not be obtained 

199 from defaults when Butler instances are constructed 

200 should be copied from ``full`` to ``config``. 

201 overwrite : `bool`, optional 

202 If `False`, do not modify a value in ``config`` if the value 

203 already exists. Default is always to overwrite with the provided 

204 ``root``. 

205 

206 Notes 

207 ----- 

208 If a keyword is explicitly defined in the supplied ``config`` it 

209 will not be overridden by this method if ``overwrite`` is `False`. 

210 This allows explicit values set in external configs to be retained. 

211 """ 

212 Config.updateParameters(DatastoreConfig, config, full, 

213 toUpdate={"root": root}, 

214 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

215 

216 @classmethod 

217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

218 return ddl.TableSpec( 

219 fields=[ 

220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

224 # Use empty string to indicate no component 

225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

226 # TODO: should checksum be Base64Bytes instead? 

227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

229 ], 

230 unique=frozenset(), 

231 indexes=[tuple(["path"])], 

232 ) 

233 

234 def __init__(self, config: Union[DatastoreConfig, str], 

235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

236 super().__init__(config, bridgeManager) 

237 if "root" not in self.config: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true

238 raise ValueError("No root directory specified in configuration") 

239 

240 # Name ourselves either using an explicit name or a name 

241 # derived from the (unexpanded) root 

242 if "name" in self.config: 

243 self.name = self.config["name"] 

244 else: 

245 # We use the unexpanded root in the name to indicate that this 

246 # datastore can be moved without having to update registry. 

247 self.name = "{}@{}".format(type(self).__name__, 

248 self.config["root"]) 

249 

250 # Support repository relocation in config 

251 # Existence of self.root is checked in subclass 

252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

253 forceDirectory=True, forceAbsolute=True) 

254 

255 self.locationFactory = LocationFactory(self.root) 

256 self.formatterFactory = FormatterFactory() 

257 

258 # Now associate formatters with storage classes 

259 self.formatterFactory.registerFormatters(self.config["formatters"], 

260 universe=bridgeManager.universe) 

261 

262 # Read the file naming templates 

263 self.templates = FileTemplates(self.config["templates"], 

264 universe=bridgeManager.universe) 

265 

266 # See if composites should be disassembled 

267 self.composites = CompositesMap(self.config["composites"], 

268 universe=bridgeManager.universe) 

269 

270 tableName = self.config["records", "table"] 

271 try: 

272 # Storage of paths and formatters, keyed by dataset_id 

273 self._table = bridgeManager.opaque.register( 

274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

275 # Interface to Registry. 

276 self._bridge = bridgeManager.register(self.name) 

277 except ReadOnlyDatabaseError: 

278 # If the database is read only and we just tried and failed to 

279 # create a table, it means someone is trying to create a read-only 

280 # butler client for an empty repo. That should be okay, as long 

281 # as they then try to get any datasets before some other client 

282 # creates the table. Chances are they'rejust validating 

283 # configuration. 

284 pass 

285 

286 # Determine whether checksums should be used - default to False 

287 self.useChecksum = self.config.get("checksum", False) 

288 

289 # Determine whether we can fall back to configuration if a 

290 # requested dataset is not known to registry 

291 self.trustGetRequest = self.config.get("trust_get_request", False) 

292 

293 # Create a cache manager 

294 self.cacheManager: AbstractDatastoreCacheManager 

295 if "cached" in self.config: 295 ↛ 299line 295 didn't jump to line 299, because the condition on line 295 was never false

296 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

297 universe=bridgeManager.universe) 

298 else: 

299 self.cacheManager = DatastoreDisabledCacheManager("", 

300 universe=bridgeManager.universe) 

301 

302 # Check existence and create directory structure if necessary 

303 if not self.root.exists(): 

304 if "create" not in self.config or not self.config["create"]: 304 ↛ 305line 304 didn't jump to line 305, because the condition on line 304 was never true

305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

306 try: 

307 self.root.mkdir() 

308 except Exception as e: 

309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

310 f" Got error: {e}") from e 

311 

312 def __str__(self) -> str: 

313 return str(self.root) 

314 

315 @property 

316 def bridge(self) -> DatastoreRegistryBridge: 

317 return self._bridge 

318 

319 def _artifact_exists(self, location: Location) -> bool: 

320 """Check that an artifact exists in this datastore at the specified 

321 location. 

322 

323 Parameters 

324 ---------- 

325 location : `Location` 

326 Expected location of the artifact associated with this datastore. 

327 

328 Returns 

329 ------- 

330 exists : `bool` 

331 True if the location can be found, false otherwise. 

332 """ 

333 log.debug("Checking if resource exists: %s", location.uri) 

334 return location.uri.exists() 

335 

336 def _delete_artifact(self, location: Location) -> None: 

337 """Delete the artifact from the datastore. 

338 

339 Parameters 

340 ---------- 

341 location : `Location` 

342 Location of the artifact associated with this datastore. 

343 """ 

344 if location.pathInStore.isabs(): 344 ↛ 345line 344 didn't jump to line 345, because the condition on line 344 was never true

345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

346 

347 try: 

348 location.uri.remove() 

349 except FileNotFoundError: 

350 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

351 raise 

352 except Exception as e: 

353 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

354 raise 

355 log.debug("Successfully deleted file: %s", location.uri) 

356 

357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

358 # Docstring inherited from GenericBaseDatastore 

359 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

360 self._table.insert(*records) 

361 

362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

363 # Docstring inherited from GenericBaseDatastore 

364 

365 # Look for the dataset_id -- there might be multiple matches 

366 # if we have disassembled the dataset. 

367 records = self._table.fetch(dataset_id=ref.id) 

368 return [StoredFileInfo.from_record(record) for record in records] 

369 

370 def _get_stored_records_associated_with_refs(self, 

371 refs: Iterable[DatasetIdRef] 

372 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

373 """Retrieve all records associated with the provided refs. 

374 

375 Parameters 

376 ---------- 

377 refs : iterable of `DatasetIdRef` 

378 The refs for which records are to be retrieved. 

379 

380 Returns 

381 ------- 

382 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

383 The matching records indexed by the ref ID. The number of entries 

384 in the dict can be smaller than the number of requested refs. 

385 """ 

386 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

387 

388 # Uniqueness is dataset_id + component so can have multiple records 

389 # per ref. 

390 records_by_ref = defaultdict(list) 

391 for record in records: 

392 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

393 return records_by_ref 

394 

395 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

396 Set[DatasetId]]: 

397 """Return paths and associated dataset refs. 

398 

399 Parameters 

400 ---------- 

401 paths : `list` of `str` or `ButlerURI` 

402 All the paths to include in search. 

403 

404 Returns 

405 ------- 

406 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

407 Mapping of each path to a set of associated database IDs. 

408 """ 

409 records = self._table.fetch(path=[str(path) for path in paths]) 

410 result = defaultdict(set) 

411 for row in records: 

412 result[row["path"]].add(row["dataset_id"]) 

413 return result 

414 

415 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

416 """Return all dataset refs associated with the supplied path. 

417 

418 Parameters 

419 ---------- 

420 pathInStore : `ButlerURI` 

421 Path of interest in the data store. 

422 

423 Returns 

424 ------- 

425 ids : `set` of `int` 

426 All `DatasetRef` IDs associated with this path. 

427 """ 

428 records = list(self._table.fetch(path=str(pathInStore))) 

429 ids = {r["dataset_id"] for r in records} 

430 return ids 

431 

432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

433 # Docstring inherited from GenericBaseDatastore 

434 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

435 

436 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

437 r"""Find all the `Location`\ s of the requested dataset in the 

438 `Datastore` and the associated stored file information. 

439 

440 Parameters 

441 ---------- 

442 ref : `DatasetRef` 

443 Reference to the required `Dataset`. 

444 

445 Returns 

446 ------- 

447 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

448 Location of the dataset within the datastore and 

449 stored information about each file and its formatter. 

450 """ 

451 # Get the file information (this will fail if no file) 

452 records = self.getStoredItemsInfo(ref) 

453 

454 # Use the path to determine the location -- we need to take 

455 # into account absolute URIs in the datastore record 

456 return [(r.file_location(self.locationFactory), r) for r in records] 

457 

458 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

459 """Check that there is only one dataset associated with the 

460 specified artifact. 

461 

462 Parameters 

463 ---------- 

464 ref : `DatasetRef` or `FakeDatasetRef` 

465 Dataset to be removed. 

466 location : `Location` 

467 The location of the artifact to be removed. 

468 

469 Returns 

470 ------- 

471 can_remove : `Bool` 

472 True if the artifact can be safely removed. 

473 """ 

474 # Can't ever delete absolute URIs. 

475 if location.pathInStore.isabs(): 

476 return False 

477 

478 # Get all entries associated with this path 

479 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

480 if not allRefs: 

481 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

482 

483 # Remove these refs from all the refs and if there is nothing left 

484 # then we can delete 

485 remainingRefs = allRefs - {ref.id} 

486 

487 if remainingRefs: 

488 return False 

489 return True 

490 

491 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

492 StoredFileInfo]]: 

493 """Predict the location and related file information of the requested 

494 dataset in this datastore. 

495 

496 Parameters 

497 ---------- 

498 ref : `DatasetRef` 

499 Reference to the required `Dataset`. 

500 

501 Returns 

502 ------- 

503 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

504 Expected Location of the dataset within the datastore and 

505 placeholder information about each file and its formatter. 

506 

507 Notes 

508 ----- 

509 Uses the current configuration to determine how we would expect the 

510 datastore files to have been written if we couldn't ask registry. 

511 This is safe so long as there has been no change to datastore 

512 configuration between writing the dataset and wanting to read it. 

513 Will not work for files that have been ingested without using the 

514 standard file template or default formatter. 

515 """ 

516 

517 # If we have a component ref we always need to ask the questions 

518 # of the composite. If the composite is disassembled this routine 

519 # should return all components. If the composite was not 

520 # disassembled the composite is what is stored regardless of 

521 # component request. Note that if the caller has disassembled 

522 # a composite there is no way for this guess to know that 

523 # without trying both the composite and component ref and seeing 

524 # if there is something at the component Location even without 

525 # disassembly being enabled. 

526 if ref.datasetType.isComponent(): 

527 ref = ref.makeCompositeRef() 

528 

529 # See if the ref is a composite that should be disassembled 

530 doDisassembly = self.composites.shouldBeDisassembled(ref) 

531 

532 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

533 

534 if doDisassembly: 

535 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

536 compRef = ref.makeComponentRef(component) 

537 location, formatter = self._determine_put_formatter_location(compRef) 

538 all_info.append((location, formatter, componentStorage, component)) 

539 

540 else: 

541 # Always use the composite ref if no disassembly 

542 location, formatter = self._determine_put_formatter_location(ref) 

543 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

544 

545 # Convert the list of tuples to have StoredFileInfo as second element 

546 return [(location, StoredFileInfo(formatter=formatter, 

547 path=location.pathInStore.path, 

548 storageClass=storageClass, 

549 component=component, 

550 checksum=None, 

551 file_size=-1)) 

552 for location, formatter, storageClass, component in all_info] 

553 

554 def _prepare_for_get(self, ref: DatasetRef, 

555 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

556 """Check parameters for ``get`` and obtain formatter and 

557 location. 

558 

559 Parameters 

560 ---------- 

561 ref : `DatasetRef` 

562 Reference to the required Dataset. 

563 parameters : `dict` 

564 `StorageClass`-specific parameters that specify, for example, 

565 a slice of the dataset to be loaded. 

566 

567 Returns 

568 ------- 

569 getInfo : `list` [`DatastoreFileGetInformation`] 

570 Parameters needed to retrieve each file. 

571 """ 

572 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

573 

574 # Get file metadata and internal metadata 

575 fileLocations = self._get_dataset_locations_info(ref) 

576 if not fileLocations: 

577 if not self.trustGetRequest: 

578 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

579 # Assume the dataset is where we think it should be 

580 fileLocations = self._get_expected_dataset_locations_info(ref) 

581 

582 # The storage class we want to use eventually 

583 refStorageClass = ref.datasetType.storageClass 

584 

585 if len(fileLocations) > 1: 

586 disassembled = True 

587 

588 # If trust is involved it is possible that there will be 

589 # components listed here that do not exist in the datastore. 

590 # Explicitly check for file artifact existence and filter out any 

591 # that are missing. 

592 if self.trustGetRequest: 

593 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

594 

595 # For now complain only if we have no components at all. One 

596 # component is probably a problem but we can punt that to the 

597 # assembler. 

598 if not fileLocations: 598 ↛ 599line 598 didn't jump to line 599, because the condition on line 598 was never true

599 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

600 

601 else: 

602 disassembled = False 

603 

604 # Is this a component request? 

605 refComponent = ref.datasetType.component() 

606 

607 fileGetInfo = [] 

608 for location, storedFileInfo in fileLocations: 

609 

610 # The storage class used to write the file 

611 writeStorageClass = storedFileInfo.storageClass 

612 

613 # If this has been disassembled we need read to match the write 

614 if disassembled: 

615 readStorageClass = writeStorageClass 

616 else: 

617 readStorageClass = refStorageClass 

618 

619 formatter = getInstanceOf(storedFileInfo.formatter, 

620 FileDescriptor(location, readStorageClass=readStorageClass, 

621 storageClass=writeStorageClass, parameters=parameters), 

622 ref.dataId) 

623 

624 formatterParams, notFormatterParams = formatter.segregateParameters() 

625 

626 # Of the remaining parameters, extract the ones supported by 

627 # this StorageClass (for components not all will be handled) 

628 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

629 

630 # The ref itself could be a component if the dataset was 

631 # disassembled by butler, or we disassembled in datastore and 

632 # components came from the datastore records 

633 component = storedFileInfo.component if storedFileInfo.component else refComponent 

634 

635 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

636 assemblerParams, formatterParams, 

637 component, readStorageClass)) 

638 

639 return fileGetInfo 

640 

641 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

642 """Check the arguments for ``put`` and obtain formatter and 

643 location. 

644 

645 Parameters 

646 ---------- 

647 inMemoryDataset : `object` 

648 The dataset to store. 

649 ref : `DatasetRef` 

650 Reference to the associated Dataset. 

651 

652 Returns 

653 ------- 

654 location : `Location` 

655 The location to write the dataset. 

656 formatter : `Formatter` 

657 The `Formatter` to use to write the dataset. 

658 

659 Raises 

660 ------ 

661 TypeError 

662 Supplied object and storage class are inconsistent. 

663 DatasetTypeNotSupportedError 

664 The associated `DatasetType` is not handled by this datastore. 

665 """ 

666 self._validate_put_parameters(inMemoryDataset, ref) 

667 return self._determine_put_formatter_location(ref) 

668 

669 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

670 """Calculate the formatter and output location to use for put. 

671 

672 Parameters 

673 ---------- 

674 ref : `DatasetRef` 

675 Reference to the associated Dataset. 

676 

677 Returns 

678 ------- 

679 location : `Location` 

680 The location to write the dataset. 

681 formatter : `Formatter` 

682 The `Formatter` to use to write the dataset. 

683 """ 

684 # Work out output file name 

685 try: 

686 template = self.templates.getTemplate(ref) 

687 except KeyError as e: 

688 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

689 

690 # Validate the template to protect against filenames from different 

691 # dataIds returning the same and causing overwrite confusion. 

692 template.validateTemplate(ref) 

693 

694 location = self.locationFactory.fromPath(template.format(ref)) 

695 

696 # Get the formatter based on the storage class 

697 storageClass = ref.datasetType.storageClass 

698 try: 

699 formatter = self.formatterFactory.getFormatter(ref, 

700 FileDescriptor(location, 

701 storageClass=storageClass), 

702 ref.dataId) 

703 except KeyError as e: 

704 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

705 f"{self.name}") from e 

706 

707 # Now that we know the formatter, update the location 

708 location = formatter.makeUpdatedLocation(location) 

709 

710 return location, formatter 

711 

712 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

713 # Docstring inherited from base class 

714 if transfer != "auto": 

715 return transfer 

716 

717 # See if the paths are within the datastore or not 

718 inside = [self._pathInStore(d.path) is not None for d in datasets] 

719 

720 if all(inside): 

721 transfer = None 

722 elif not any(inside): 722 ↛ 731line 722 didn't jump to line 731, because the condition on line 722 was never false

723 # Allow ButlerURI to use its own knowledge 

724 transfer = "auto" 

725 else: 

726 # This can happen when importing from a datastore that 

727 # has had some datasets ingested using "direct" mode. 

728 # Also allow ButlerURI to sort it out but warn about it. 

729 # This can happen if you are importing from a datastore 

730 # that had some direct transfer datasets. 

731 log.warning("Some datasets are inside the datastore and some are outside. Using 'split' " 

732 "transfer mode. This assumes that the files outside the datastore are " 

733 "still accessible to the new butler since they will not be copied into " 

734 "the target datastore.") 

735 transfer = "split" 

736 

737 return transfer 

738 

739 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

740 """Return path relative to datastore root 

741 

742 Parameters 

743 ---------- 

744 path : `str` or `ButlerURI` 

745 Path to dataset. Can be absolute URI. If relative assumed to 

746 be relative to the datastore. Returns path in datastore 

747 or raises an exception if the path it outside. 

748 

749 Returns 

750 ------- 

751 inStore : `str` 

752 Path relative to datastore root. Returns `None` if the file is 

753 outside the root. 

754 """ 

755 # Relative path will always be relative to datastore 

756 pathUri = ButlerURI(path, forceAbsolute=False) 

757 return pathUri.relative_to(self.root) 

758 

759 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

760 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

761 """Standardize the path of a to-be-ingested file. 

762 

763 Parameters 

764 ---------- 

765 path : `str` or `ButlerURI` 

766 Path of a file to be ingested. 

767 transfer : `str`, optional 

768 How (and whether) the dataset should be added to the datastore. 

769 See `ingest` for details of transfer modes. 

770 This implementation is provided only so 

771 `NotImplementedError` can be raised if the mode is not supported; 

772 actual transfers are deferred to `_extractIngestInfo`. 

773 

774 Returns 

775 ------- 

776 path : `str` or `ButlerURI` 

777 New path in what the datastore considers standard form. If an 

778 absolute URI was given that will be returned unchanged. 

779 

780 Notes 

781 ----- 

782 Subclasses of `FileDatastore` can implement this method instead 

783 of `_prepIngest`. It should not modify the data repository or given 

784 file in any way. 

785 

786 Raises 

787 ------ 

788 NotImplementedError 

789 Raised if the datastore does not support the given transfer mode 

790 (including the case where ingest is not supported at all). 

791 FileNotFoundError 

792 Raised if one of the given files does not exist. 

793 """ 

794 if transfer not in (None, "direct", "split") + self.root.transferModes: 794 ↛ 795line 794 didn't jump to line 795, because the condition on line 794 was never true

795 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

796 

797 # A relative URI indicates relative to datastore root 

798 srcUri = ButlerURI(path, forceAbsolute=False) 

799 if not srcUri.isabs(): 

800 srcUri = self.root.join(path) 

801 

802 if not srcUri.exists(): 

803 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

804 f"are assumed to be relative to {self.root} unless they are absolute.") 

805 

806 if transfer is None: 

807 relpath = srcUri.relative_to(self.root) 

808 if not relpath: 

809 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

810 f"within datastore ({self.root})") 

811 

812 # Return the relative path within the datastore for internal 

813 # transfer 

814 path = relpath 

815 

816 return path 

817 

818 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

819 formatter: Union[Formatter, Type[Formatter]], 

820 transfer: Optional[str] = None) -> StoredFileInfo: 

821 """Relocate (if necessary) and extract `StoredFileInfo` from a 

822 to-be-ingested file. 

823 

824 Parameters 

825 ---------- 

826 path : `str` or `ButlerURI` 

827 URI or path of a file to be ingested. 

828 ref : `DatasetRef` 

829 Reference for the dataset being ingested. Guaranteed to have 

830 ``dataset_id not None`. 

831 formatter : `type` or `Formatter` 

832 `Formatter` subclass to use for this dataset or an instance. 

833 transfer : `str`, optional 

834 How (and whether) the dataset should be added to the datastore. 

835 See `ingest` for details of transfer modes. 

836 

837 Returns 

838 ------- 

839 info : `StoredFileInfo` 

840 Internal datastore record for this file. This will be inserted by 

841 the caller; the `_extractIngestInfo` is only resposible for 

842 creating and populating the struct. 

843 

844 Raises 

845 ------ 

846 FileNotFoundError 

847 Raised if one of the given files does not exist. 

848 FileExistsError 

849 Raised if transfer is not `None` but the (internal) location the 

850 file would be moved to is already occupied. 

851 """ 

852 if self._transaction is None: 852 ↛ 853line 852 didn't jump to line 853, because the condition on line 852 was never true

853 raise RuntimeError("Ingest called without transaction enabled") 

854 

855 # Create URI of the source path, do not need to force a relative 

856 # path to absolute. 

857 srcUri = ButlerURI(path, forceAbsolute=False) 

858 

859 # Track whether we have read the size of the source yet 

860 have_sized = False 

861 

862 tgtLocation: Optional[Location] 

863 if transfer is None or transfer == "split": 

864 # A relative path is assumed to be relative to the datastore 

865 # in this context 

866 if not srcUri.isabs(): 

867 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

868 else: 

869 # Work out the path in the datastore from an absolute URI 

870 # This is required to be within the datastore. 

871 pathInStore = srcUri.relative_to(self.root) 

872 if pathInStore is None and transfer is None: 872 ↛ 873line 872 didn't jump to line 873, because the condition on line 872 was never true

873 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

874 f"not within datastore {self.root}") 

875 if pathInStore: 875 ↛ 877line 875 didn't jump to line 877, because the condition on line 875 was never false

876 tgtLocation = self.locationFactory.fromPath(pathInStore) 

877 elif transfer == "split": 

878 # Outside the datastore but treat that as a direct ingest 

879 # instead. 

880 tgtLocation = None 

881 else: 

882 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for" 

883 f" URI {srcUri}") 

884 elif transfer == "direct": 884 ↛ 889line 884 didn't jump to line 889, because the condition on line 884 was never true

885 # Want to store the full URI to the resource directly in 

886 # datastore. This is useful for referring to permanent archive 

887 # storage for raw data. 

888 # Trust that people know what they are doing. 

889 tgtLocation = None 

890 else: 

891 # Work out the name we want this ingested file to have 

892 # inside the datastore 

893 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

894 if not tgtLocation.uri.dirname().exists(): 

895 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

896 tgtLocation.uri.dirname().mkdir() 

897 

898 # if we are transferring from a local file to a remote location 

899 # it may be more efficient to get the size and checksum of the 

900 # local file rather than the transferred one 

901 if not srcUri.scheme or srcUri.scheme == "file": 901 ↛ 907line 901 didn't jump to line 907, because the condition on line 901 was never false

902 size = srcUri.size() 

903 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

904 have_sized = True 

905 

906 # transfer the resource to the destination 

907 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

908 

909 if tgtLocation is None: 909 ↛ 911line 909 didn't jump to line 911, because the condition on line 909 was never true

910 # This means we are using direct mode 

911 targetUri = srcUri 

912 targetPath = str(srcUri) 

913 else: 

914 targetUri = tgtLocation.uri 

915 targetPath = tgtLocation.pathInStore.path 

916 

917 # the file should exist in the datastore now 

918 if not have_sized: 

919 size = targetUri.size() 

920 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

921 

922 return StoredFileInfo(formatter=formatter, path=targetPath, 

923 storageClass=ref.datasetType.storageClass, 

924 component=ref.datasetType.component(), 

925 file_size=size, checksum=checksum) 

926 

927 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

928 # Docstring inherited from Datastore._prepIngest. 

929 filtered = [] 

930 for dataset in datasets: 

931 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

932 if not acceptable: 

933 continue 

934 else: 

935 dataset.refs = acceptable 

936 if dataset.formatter is None: 

937 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

938 else: 

939 assert isinstance(dataset.formatter, (type, str)) 

940 dataset.formatter = getClassOf(dataset.formatter) 

941 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

942 filtered.append(dataset) 

943 return _IngestPrepData(filtered) 

944 

945 @transactional 

946 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

947 # Docstring inherited from Datastore._finishIngest. 

948 refsAndInfos = [] 

949 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

950 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

951 # Do ingest as if the first dataset ref is associated with the file 

952 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

953 transfer=transfer) 

954 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

955 self._register_datasets(refsAndInfos) 

956 

957 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

958 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

959 """Given a source URI and a DatasetRef, determine the name the 

960 dataset will have inside datastore. 

961 

962 Parameters 

963 ---------- 

964 srcUri : `ButlerURI` 

965 URI to the source dataset file. 

966 ref : `DatasetRef` 

967 Ref associated with the newly-ingested dataset artifact. This 

968 is used to determine the name within the datastore. 

969 formatter : `Formatter` or Formatter class. 

970 Formatter to use for validation. Can be a class or an instance. 

971 

972 Returns 

973 ------- 

974 location : `Location` 

975 Target location for the newly-ingested dataset. 

976 """ 

977 # Ingesting a file from outside the datastore. 

978 # This involves a new name. 

979 template = self.templates.getTemplate(ref) 

980 location = self.locationFactory.fromPath(template.format(ref)) 

981 

982 # Get the extension 

983 ext = srcUri.getExtension() 

984 

985 # Update the destination to include that extension 

986 location.updateExtension(ext) 

987 

988 # Ask the formatter to validate this extension 

989 formatter.validateExtension(location) 

990 

991 return location 

992 

993 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

994 """Write out in memory dataset to datastore. 

995 

996 Parameters 

997 ---------- 

998 inMemoryDataset : `object` 

999 Dataset to write to datastore. 

1000 ref : `DatasetRef` 

1001 Registry information associated with this dataset. 

1002 

1003 Returns 

1004 ------- 

1005 info : `StoredFileInfo` 

1006 Information describin the artifact written to the datastore. 

1007 """ 

1008 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1009 uri = location.uri 

1010 

1011 if not uri.dirname().exists(): 

1012 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1013 uri.dirname().mkdir() 

1014 

1015 if self._transaction is None: 1015 ↛ 1016line 1015 didn't jump to line 1016, because the condition on line 1015 was never true

1016 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1017 

1018 def _removeFileExists(uri: ButlerURI) -> None: 

1019 """Remove a file and do not complain if it is not there. 

1020 

1021 This is important since a formatter might fail before the file 

1022 is written and we should not confuse people by writing spurious 

1023 error messages to the log. 

1024 """ 

1025 try: 

1026 uri.remove() 

1027 except FileNotFoundError: 

1028 pass 

1029 

1030 # Register a callback to try to delete the uploaded data if 

1031 # something fails below 

1032 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1033 

1034 # For a local file, simply use the formatter directly 

1035 if uri.isLocal: 

1036 try: 

1037 formatter.write(inMemoryDataset) 

1038 except Exception as e: 

1039 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

1040 f"to location {uri}") from e 

1041 log.debug("Successfully wrote python object to local file at %s", uri) 

1042 else: 

1043 # This is a remote URI, so first try bytes and write directly else 

1044 # fallback to a temporary file 

1045 try: 

1046 serializedDataset = formatter.toBytes(inMemoryDataset) 

1047 except NotImplementedError: 1047 ↛ 1066line 1047 didn't jump to line 1066

1048 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

1049 # Need to configure the formatter to write to a different 

1050 # location and that needs us to overwrite internals 

1051 tmpLocation = Location(*os.path.split(tmpFile.name)) 

1052 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

1053 with formatter._updateLocation(tmpLocation): 

1054 try: 

1055 formatter.write(inMemoryDataset) 

1056 except Exception as e: 

1057 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1058 f" {type(inMemoryDataset)} to " 

1059 f"temporary location {tmpLocation.uri}") from e 

1060 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

1061 

1062 # Cache if required 

1063 self.cacheManager.move_to_cache(tmpLocation.uri, ref) 

1064 

1065 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1066 except Exception as e: 

1067 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e 

1068 else: 

1069 log.debug("Writing bytes directly to %s", uri) 

1070 uri.write(serializedDataset, overwrite=True) 

1071 log.debug("Successfully wrote bytes directly to %s", uri) 

1072 

1073 # URI is needed to resolve what ingest case are we dealing with 

1074 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1075 

1076 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1077 ref: DatasetRef, isComponent: bool = False) -> Any: 

1078 """Read the artifact from datastore into in memory object. 

1079 

1080 Parameters 

1081 ---------- 

1082 getInfo : `DatastoreFileGetInformation` 

1083 Information about the artifact within the datastore. 

1084 ref : `DatasetRef` 

1085 The registry information associated with this artifact. 

1086 isComponent : `bool` 

1087 Flag to indicate if a component is being read from this artifact. 

1088 

1089 Returns 

1090 ------- 

1091 inMemoryDataset : `object` 

1092 The artifact as a python object. 

1093 """ 

1094 location = getInfo.location 

1095 uri = location.uri 

1096 log.debug("Accessing data from %s", uri) 

1097 

1098 # Cannot recalculate checksum but can compare size as a quick check 

1099 # Do not do this if the size is negative since that indicates 

1100 # we do not know. 

1101 recorded_size = getInfo.info.file_size 

1102 resource_size = uri.size() 

1103 if recorded_size >= 0 and resource_size != recorded_size: 1103 ↛ 1104line 1103 didn't jump to line 1104, because the condition on line 1103 was never true

1104 raise RuntimeError("Integrity failure in Datastore. " 

1105 f"Size of file {uri} ({resource_size}) " 

1106 f"does not match size recorded in registry of {recorded_size}") 

1107 

1108 # For the general case we have choices for how to proceed. 

1109 # 1. Always use a local file (downloading the remote resource to a 

1110 # temporary file if needed). 

1111 # 2. Use a threshold size and read into memory and use bytes. 

1112 # Use both for now with an arbitrary hand off size. 

1113 # This allows small datasets to be downloaded from remote object 

1114 # stores without requiring a temporary file. 

1115 

1116 formatter = getInfo.formatter 

1117 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1118 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1119 with time_this(log, msg="Reading bytes from %s", args=(uri,)): 

1120 serializedDataset = uri.read() 

1121 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1122 f"component {getInfo.component}" if isComponent else "", 

1123 len(serializedDataset), uri, formatter.name()) 

1124 try: 

1125 result = formatter.fromBytes(serializedDataset, 

1126 component=getInfo.component if isComponent else None) 

1127 except Exception as e: 

1128 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1129 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1130 else: 

1131 # Read from file. 

1132 

1133 # Have to update the Location associated with the formatter 

1134 # because formatter.read does not allow an override. 

1135 # This could be improved. 

1136 location_updated = False 

1137 msg = "" 

1138 

1139 # First check in cache for local version. 

1140 # The cache will only be relevant for remote resources. 

1141 if not uri.isLocal: 

1142 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension()) 

1143 if cached_file is not None: 1143 ↛ 1144line 1143 didn't jump to line 1144, because the condition on line 1143 was never true

1144 msg = f"(via cache read of remote file {uri})" 

1145 uri = cached_file 

1146 location_updated = True 

1147 

1148 with uri.as_local() as local_uri: 

1149 

1150 # URI was remote and file was downloaded 

1151 if uri != local_uri: 

1152 cache_msg = "" 

1153 location_updated = True 

1154 

1155 # Cache the downloaded file if needed. 

1156 cached_uri = self.cacheManager.move_to_cache(local_uri, ref) 

1157 if cached_uri is not None: 1157 ↛ 1158line 1157 didn't jump to line 1158, because the condition on line 1157 was never true

1158 local_uri = cached_uri 

1159 cache_msg = " and cached" 

1160 

1161 msg = f"(via download to local file{cache_msg})" 

1162 

1163 # Calculate the (possibly) new location for the formatter 

1164 # to use. 

1165 newLocation = Location(*local_uri.split()) if location_updated else None 

1166 

1167 log.debug("Reading%s from location %s %s with formatter %s", 

1168 f" component {getInfo.component}" if isComponent else "", 

1169 uri, msg, formatter.name()) 

1170 try: 

1171 with formatter._updateLocation(newLocation): 

1172 with time_this(log, msg="Reading%s from location %s %s with formatter %s", 

1173 args=(f" component {getInfo.component}" if isComponent else "", 

1174 uri, msg, formatter.name())): 

1175 result = formatter.read(component=getInfo.component if isComponent else None) 

1176 except Exception as e: 

1177 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1178 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1179 

1180 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1181 isComponent=isComponent) 

1182 

1183 def knows(self, ref: DatasetRef) -> bool: 

1184 """Check if the dataset is known to the datastore. 

1185 

1186 Does not check for existence of any artifact. 

1187 

1188 Parameters 

1189 ---------- 

1190 ref : `DatasetRef` 

1191 Reference to the required dataset. 

1192 

1193 Returns 

1194 ------- 

1195 exists : `bool` 

1196 `True` if the dataset is known to the datastore. 

1197 """ 

1198 fileLocations = self._get_dataset_locations_info(ref) 

1199 if fileLocations: 

1200 return True 

1201 return False 

1202 

1203 def exists(self, ref: DatasetRef) -> bool: 

1204 """Check if the dataset exists in the datastore. 

1205 

1206 Parameters 

1207 ---------- 

1208 ref : `DatasetRef` 

1209 Reference to the required dataset. 

1210 

1211 Returns 

1212 ------- 

1213 exists : `bool` 

1214 `True` if the entity exists in the `Datastore`. 

1215 """ 

1216 fileLocations = self._get_dataset_locations_info(ref) 

1217 

1218 # if we are being asked to trust that registry might not be correct 

1219 # we ask for the expected locations and check them explicitly 

1220 if not fileLocations: 

1221 if not self.trustGetRequest: 

1222 return False 

1223 

1224 # When we are guessing a dataset location we can not check 

1225 # for the existence of every component since we can not 

1226 # know if every component was written. Instead we check 

1227 # for the existence of any of the expected locations. 

1228 for location, _ in self._get_expected_dataset_locations_info(ref): 1228 ↛ 1231line 1228 didn't jump to line 1231, because the loop on line 1228 didn't complete

1229 if self._artifact_exists(location): 1229 ↛ 1228line 1229 didn't jump to line 1228, because the condition on line 1229 was never false

1230 return True 

1231 return False 

1232 

1233 # All listed artifacts must exist. 

1234 for location, _ in fileLocations: 

1235 if not self._artifact_exists(location): 

1236 return False 

1237 

1238 return True 

1239 

1240 def getURIs(self, ref: DatasetRef, 

1241 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1242 """Return URIs associated with dataset. 

1243 

1244 Parameters 

1245 ---------- 

1246 ref : `DatasetRef` 

1247 Reference to the required dataset. 

1248 predict : `bool`, optional 

1249 If the datastore does not know about the dataset, should it 

1250 return a predicted URI or not? 

1251 

1252 Returns 

1253 ------- 

1254 primary : `ButlerURI` 

1255 The URI to the primary artifact associated with this dataset. 

1256 If the dataset was disassembled within the datastore this 

1257 may be `None`. 

1258 components : `dict` 

1259 URIs to any components associated with the dataset artifact. 

1260 Can be empty if there are no components. 

1261 """ 

1262 

1263 primary: Optional[ButlerURI] = None 

1264 components: Dict[str, ButlerURI] = {} 

1265 

1266 # if this has never been written then we have to guess 

1267 if not self.exists(ref): 

1268 if not predict: 

1269 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1270 

1271 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1272 

1273 if doDisassembly: 

1274 

1275 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1276 compRef = ref.makeComponentRef(component) 

1277 compLocation, _ = self._determine_put_formatter_location(compRef) 

1278 

1279 # Add a URI fragment to indicate this is a guess 

1280 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1281 

1282 else: 

1283 

1284 location, _ = self._determine_put_formatter_location(ref) 

1285 

1286 # Add a URI fragment to indicate this is a guess 

1287 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1288 

1289 return primary, components 

1290 

1291 # If this is a ref that we have written we can get the path. 

1292 # Get file metadata and internal metadata 

1293 fileLocations = self._get_dataset_locations_info(ref) 

1294 

1295 guessing = False 

1296 if not fileLocations: 

1297 if not self.trustGetRequest: 1297 ↛ 1298line 1297 didn't jump to line 1298, because the condition on line 1297 was never true

1298 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1299 fileLocations = self._get_expected_dataset_locations_info(ref) 

1300 guessing = True 

1301 

1302 if len(fileLocations) == 1: 

1303 # No disassembly so this is the primary URI 

1304 uri = fileLocations[0][0].uri 

1305 if guessing and not uri.exists(): 1305 ↛ 1306line 1305 didn't jump to line 1306, because the condition on line 1305 was never true

1306 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1307 primary = uri 

1308 

1309 else: 

1310 for location, storedFileInfo in fileLocations: 

1311 if storedFileInfo.component is None: 1311 ↛ 1312line 1311 didn't jump to line 1312, because the condition on line 1311 was never true

1312 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1313 uri = location.uri 

1314 if guessing and not uri.exists(): 1314 ↛ 1318line 1314 didn't jump to line 1318, because the condition on line 1314 was never true

1315 # If we are trusting then it is entirely possible for 

1316 # some components to be missing. In that case we skip 

1317 # to the next component. 

1318 if self.trustGetRequest: 

1319 continue 

1320 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1321 components[storedFileInfo.component] = uri 

1322 

1323 return primary, components 

1324 

1325 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1326 """URI to the Dataset. 

1327 

1328 Parameters 

1329 ---------- 

1330 ref : `DatasetRef` 

1331 Reference to the required Dataset. 

1332 predict : `bool` 

1333 If `True`, allow URIs to be returned of datasets that have not 

1334 been written. 

1335 

1336 Returns 

1337 ------- 

1338 uri : `str` 

1339 URI pointing to the dataset within the datastore. If the 

1340 dataset does not exist in the datastore, and if ``predict`` is 

1341 `True`, the URI will be a prediction and will include a URI 

1342 fragment "#predicted". 

1343 If the datastore does not have entities that relate well 

1344 to the concept of a URI the returned URI will be 

1345 descriptive. The returned URI is not guaranteed to be obtainable. 

1346 

1347 Raises 

1348 ------ 

1349 FileNotFoundError 

1350 Raised if a URI has been requested for a dataset that does not 

1351 exist and guessing is not allowed. 

1352 RuntimeError 

1353 Raised if a request is made for a single URI but multiple URIs 

1354 are associated with this dataset. 

1355 

1356 Notes 

1357 ----- 

1358 When a predicted URI is requested an attempt will be made to form 

1359 a reasonable URI based on file templates and the expected formatter. 

1360 """ 

1361 primary, components = self.getURIs(ref, predict) 

1362 if primary is None or components: 1362 ↛ 1363line 1362 didn't jump to line 1363, because the condition on line 1362 was never true

1363 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1364 "Use Dataastore.getURIs() instead.") 

1365 return primary 

1366 

1367 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1368 destination: ButlerURI, transfer: str = "auto", 

1369 preserve_path: bool = True, 

1370 overwrite: bool = False) -> List[ButlerURI]: 

1371 """Retrieve the file artifacts associated with the supplied refs. 

1372 

1373 Parameters 

1374 ---------- 

1375 refs : iterable of `DatasetRef` 

1376 The datasets for which file artifacts are to be retrieved. 

1377 A single ref can result in multiple files. The refs must 

1378 be resolved. 

1379 destination : `ButlerURI` 

1380 Location to write the file artifacts. 

1381 transfer : `str`, optional 

1382 Method to use to transfer the artifacts. Must be one of the options 

1383 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1384 preserve_path : `bool`, optional 

1385 If `True` the full path of the file artifact within the datastore 

1386 is preserved. If `False` the final file component of the path 

1387 is used. 

1388 overwrite : `bool`, optional 

1389 If `True` allow transfers to overwrite existing files at the 

1390 destination. 

1391 

1392 Returns 

1393 ------- 

1394 targets : `list` of `ButlerURI` 

1395 URIs of file artifacts in destination location. Order is not 

1396 preserved. 

1397 """ 

1398 if not destination.isdir(): 1398 ↛ 1399line 1398 didn't jump to line 1399, because the condition on line 1398 was never true

1399 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1400 

1401 if transfer == "move": 

1402 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1403 

1404 # Source -> Destination 

1405 # This also helps filter out duplicate DatasetRef in the request 

1406 # that will map to the same underlying file transfer. 

1407 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1408 

1409 for ref in refs: 

1410 locations = self._get_dataset_locations_info(ref) 

1411 for location, _ in locations: 

1412 source_uri = location.uri 

1413 target_path: Union[str, ButlerURI] 

1414 if preserve_path: 

1415 target_path = location.pathInStore 

1416 if target_path.isabs(): 1416 ↛ 1419line 1416 didn't jump to line 1419, because the condition on line 1416 was never true

1417 # This is an absolute path to an external file. 

1418 # Use the full path. 

1419 target_path = target_path.relativeToPathRoot 

1420 else: 

1421 target_path = source_uri.basename() 

1422 target_uri = destination.join(target_path) 

1423 to_transfer[source_uri] = target_uri 

1424 

1425 # In theory can now parallelize the transfer 

1426 log.debug("Number of artifacts to transfer to %s: %d", 

1427 str(destination), len(to_transfer)) 

1428 for source_uri, target_uri in to_transfer.items(): 

1429 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1430 

1431 return list(to_transfer.values()) 

1432 

1433 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1434 """Load an InMemoryDataset from the store. 

1435 

1436 Parameters 

1437 ---------- 

1438 ref : `DatasetRef` 

1439 Reference to the required Dataset. 

1440 parameters : `dict` 

1441 `StorageClass`-specific parameters that specify, for example, 

1442 a slice of the dataset to be loaded. 

1443 

1444 Returns 

1445 ------- 

1446 inMemoryDataset : `object` 

1447 Requested dataset or slice thereof as an InMemoryDataset. 

1448 

1449 Raises 

1450 ------ 

1451 FileNotFoundError 

1452 Requested dataset can not be retrieved. 

1453 TypeError 

1454 Return value from formatter has unexpected type. 

1455 ValueError 

1456 Formatter failed to process the dataset. 

1457 """ 

1458 allGetInfo = self._prepare_for_get(ref, parameters) 

1459 refComponent = ref.datasetType.component() 

1460 

1461 # Supplied storage class for the component being read 

1462 refStorageClass = ref.datasetType.storageClass 

1463 

1464 # Create mapping from component name to related info 

1465 allComponents = {i.component: i for i in allGetInfo} 

1466 

1467 # By definition the dataset is disassembled if we have more 

1468 # than one record for it. 

1469 isDisassembled = len(allGetInfo) > 1 

1470 

1471 # Look for the special case where we are disassembled but the 

1472 # component is a derived component that was not written during 

1473 # disassembly. For this scenario we need to check that the 

1474 # component requested is listed as a derived component for the 

1475 # composite storage class 

1476 isDisassembledReadOnlyComponent = False 

1477 if isDisassembled and refComponent: 

1478 # The composite storage class should be accessible through 

1479 # the component dataset type 

1480 compositeStorageClass = ref.datasetType.parentStorageClass 

1481 

1482 # In the unlikely scenario where the composite storage 

1483 # class is not known, we can only assume that this is a 

1484 # normal component. If that assumption is wrong then the 

1485 # branch below that reads a persisted component will fail 

1486 # so there is no need to complain here. 

1487 if compositeStorageClass is not None: 1487 ↛ 1490line 1487 didn't jump to line 1490, because the condition on line 1487 was never false

1488 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1489 

1490 if isDisassembled and not refComponent: 

1491 # This was a disassembled dataset spread over multiple files 

1492 # and we need to put them all back together again. 

1493 # Read into memory and then assemble 

1494 

1495 # Check that the supplied parameters are suitable for the type read 

1496 refStorageClass.validateParameters(parameters) 

1497 

1498 # We want to keep track of all the parameters that were not used 

1499 # by formatters. We assume that if any of the component formatters 

1500 # use a parameter that we do not need to apply it again in the 

1501 # assembler. 

1502 usedParams = set() 

1503 

1504 components: Dict[str, Any] = {} 

1505 for getInfo in allGetInfo: 

1506 # assemblerParams are parameters not understood by the 

1507 # associated formatter. 

1508 usedParams.update(set(getInfo.formatterParams)) 

1509 

1510 component = getInfo.component 

1511 

1512 if component is None: 1512 ↛ 1513line 1512 didn't jump to line 1513, because the condition on line 1512 was never true

1513 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1514 

1515 # We do not want the formatter to think it's reading 

1516 # a component though because it is really reading a 

1517 # standalone dataset -- always tell reader it is not a 

1518 # component. 

1519 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1520 

1521 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1522 

1523 # Any unused parameters will have to be passed to the assembler 

1524 if parameters: 

1525 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1526 else: 

1527 unusedParams = {} 

1528 

1529 # Process parameters 

1530 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1531 parameters=unusedParams) 

1532 

1533 elif isDisassembledReadOnlyComponent: 

1534 

1535 compositeStorageClass = ref.datasetType.parentStorageClass 

1536 if compositeStorageClass is None: 1536 ↛ 1537line 1536 didn't jump to line 1537, because the condition on line 1536 was never true

1537 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1538 "no composite storage class is available.") 

1539 

1540 if refComponent is None: 1540 ↛ 1542line 1540 didn't jump to line 1542, because the condition on line 1540 was never true

1541 # Mainly for mypy 

1542 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1543 

1544 # Assume that every derived component can be calculated by 

1545 # forwarding the request to a single read/write component. 

1546 # Rather than guessing which rw component is the right one by 

1547 # scanning each for a derived component of the same name, 

1548 # we ask the storage class delegate directly which one is best to 

1549 # use. 

1550 compositeDelegate = compositeStorageClass.delegate() 

1551 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1552 set(allComponents)) 

1553 

1554 # Select the relevant component 

1555 rwInfo = allComponents[forwardedComponent] 

1556 

1557 # For now assume that read parameters are validated against 

1558 # the real component and not the requested component 

1559 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1560 forwardedStorageClass.validateParameters(parameters) 

1561 

1562 # Unfortunately the FileDescriptor inside the formatter will have 

1563 # the wrong write storage class so we need to create a new one 

1564 # given the immutability constraint. 

1565 writeStorageClass = rwInfo.info.storageClass 

1566 

1567 # We may need to put some thought into parameters for read 

1568 # components but for now forward them on as is 

1569 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1570 readStorageClass=refStorageClass, 

1571 storageClass=writeStorageClass, 

1572 parameters=parameters), 

1573 ref.dataId) 

1574 

1575 # The assembler can not receive any parameter requests for a 

1576 # derived component at this time since the assembler will 

1577 # see the storage class of the derived component and those 

1578 # parameters will have to be handled by the formatter on the 

1579 # forwarded storage class. 

1580 assemblerParams: Dict[str, Any] = {} 

1581 

1582 # Need to created a new info that specifies the derived 

1583 # component and associated storage class 

1584 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1585 rwInfo.info, assemblerParams, {}, 

1586 refComponent, refStorageClass) 

1587 

1588 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1589 

1590 else: 

1591 # Single file request or component from that composite file 

1592 for lookup in (refComponent, None): 1592 ↛ 1597line 1592 didn't jump to line 1597, because the loop on line 1592 didn't complete

1593 if lookup in allComponents: 1593 ↛ 1592line 1593 didn't jump to line 1592, because the condition on line 1593 was never false

1594 getInfo = allComponents[lookup] 

1595 break 

1596 else: 

1597 raise FileNotFoundError(f"Component {refComponent} not found " 

1598 f"for ref {ref} in datastore {self.name}") 

1599 

1600 # Do not need the component itself if already disassembled 

1601 if isDisassembled: 

1602 isComponent = False 

1603 else: 

1604 isComponent = getInfo.component is not None 

1605 

1606 # For a disassembled component we can validate parametersagainst 

1607 # the component storage class directly 

1608 if isDisassembled: 

1609 refStorageClass.validateParameters(parameters) 

1610 else: 

1611 # For an assembled composite this could be a derived 

1612 # component derived from a real component. The validity 

1613 # of the parameters is not clear. For now validate against 

1614 # the composite storage class 

1615 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1616 

1617 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1618 

1619 @transactional 

1620 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1621 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1622 

1623 Parameters 

1624 ---------- 

1625 inMemoryDataset : `object` 

1626 The dataset to store. 

1627 ref : `DatasetRef` 

1628 Reference to the associated Dataset. 

1629 

1630 Raises 

1631 ------ 

1632 TypeError 

1633 Supplied object and storage class are inconsistent. 

1634 DatasetTypeNotSupportedError 

1635 The associated `DatasetType` is not handled by this datastore. 

1636 

1637 Notes 

1638 ----- 

1639 If the datastore is configured to reject certain dataset types it 

1640 is possible that the put will fail and raise a 

1641 `DatasetTypeNotSupportedError`. The main use case for this is to 

1642 allow `ChainedDatastore` to put to multiple datastores without 

1643 requiring that every datastore accepts the dataset. 

1644 """ 

1645 

1646 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1647 # doDisassembly = True 

1648 

1649 artifacts = [] 

1650 if doDisassembly: 

1651 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1652 for component, componentInfo in components.items(): 

1653 # Don't recurse because we want to take advantage of 

1654 # bulk insert -- need a new DatasetRef that refers to the 

1655 # same dataset_id but has the component DatasetType 

1656 # DatasetType does not refer to the types of components 

1657 # So we construct one ourselves. 

1658 compRef = ref.makeComponentRef(component) 

1659 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1660 artifacts.append((compRef, storedInfo)) 

1661 else: 

1662 # Write the entire thing out 

1663 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1664 artifacts.append((ref, storedInfo)) 

1665 

1666 self._register_datasets(artifacts) 

1667 

1668 @transactional 

1669 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1670 # Get file metadata and internal metadata 

1671 if not isinstance(ref, DatasetRef): 

1672 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1673 # Assumed to be an iterable of refs so bulk mode enabled. 

1674 try: 

1675 self.bridge.moveToTrash(ref) 

1676 except Exception as e: 

1677 if ignore_errors: 

1678 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1679 else: 

1680 raise 

1681 return 

1682 

1683 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1684 

1685 fileLocations = self._get_dataset_locations_info(ref) 

1686 

1687 if not fileLocations: 

1688 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1689 if ignore_errors: 1689 ↛ 1690line 1689 didn't jump to line 1690, because the condition on line 1689 was never true

1690 log.warning(err_msg) 

1691 return 

1692 else: 

1693 raise FileNotFoundError(err_msg) 

1694 

1695 for location, storedFileInfo in fileLocations: 

1696 if not self._artifact_exists(location): 1696 ↛ 1697line 1696 didn't jump to line 1697, because the condition on line 1696 was never true

1697 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1698 f"associated artifact ({location.uri}) is missing" 

1699 if ignore_errors: 

1700 log.warning(err_msg) 

1701 return 

1702 else: 

1703 raise FileNotFoundError(err_msg) 

1704 

1705 # Mark dataset as trashed 

1706 try: 

1707 self.bridge.moveToTrash([ref]) 

1708 except Exception as e: 

1709 if ignore_errors: 

1710 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s " 

1711 "but encountered an error: %s", ref, self.name, e) 

1712 pass 

1713 else: 

1714 raise 

1715 

1716 @transactional 

1717 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1718 """Remove all datasets from the trash. 

1719 

1720 Parameters 

1721 ---------- 

1722 ignore_errors : `bool` 

1723 If `True` return without error even if something went wrong. 

1724 Problems could occur if another process is simultaneously trying 

1725 to delete. 

1726 """ 

1727 log.debug("Emptying trash in datastore %s", self.name) 

1728 

1729 # Context manager will empty trash iff we finish it without raising. 

1730 # It will also automatically delete the relevant rows from the 

1731 # trash table and the records table. 

1732 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

1733 record_column="path") as trash_data: 

1734 # Removing the artifacts themselves requires that the files are 

1735 # not also associated with refs that are not to be trashed. 

1736 # Therefore need to do a query with the file paths themselves 

1737 # and return all the refs associated with them. Can only delete 

1738 # a file if the refs to be trashed are the only refs associated 

1739 # with the file. 

1740 # This requires multiple copies of the trashed items 

1741 trashed, artifacts_to_keep = trash_data 

1742 

1743 if artifacts_to_keep is None: 

1744 # The bridge is not helping us so have to work it out 

1745 # ourselves. This is not going to be as efficient. 

1746 trashed = list(trashed) 

1747 

1748 # The instance check is for mypy since up to this point it 

1749 # does not know the type of info. 

1750 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

1751 if isinstance(info, StoredFileInfo)]) 

1752 

1753 for ref, info in trashed: 

1754 

1755 # Mypy needs to know this is not the base class 

1756 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1757 

1758 # Check for mypy 

1759 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1760 

1761 path_map[info.path].remove(ref.id) 

1762 if not path_map[info.path]: 1762 ↛ 1753line 1762 didn't jump to line 1753, because the condition on line 1762 was never false

1763 del path_map[info.path] 

1764 

1765 artifacts_to_keep = set(path_map) 

1766 

1767 for ref, info in trashed: 

1768 

1769 # Should not happen for this implementation but need 

1770 # to keep mypy happy. 

1771 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

1772 

1773 # Mypy needs to know this is not the base class 

1774 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1775 

1776 # Check for mypy 

1777 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1778 

1779 if info.path in artifacts_to_keep: 

1780 # This is a multi-dataset artifact and we are not 

1781 # removing all associated refs. 

1782 continue 

1783 

1784 # Only trashed refs still known to datastore will be returned. 

1785 location = info.file_location(self.locationFactory) 

1786 

1787 # Point of no return for this artifact 

1788 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1789 try: 

1790 self._delete_artifact(location) 

1791 except FileNotFoundError: 

1792 # If the file itself has been deleted there is nothing 

1793 # we can do about it. It is possible that trash has 

1794 # been run in parallel in another process or someone 

1795 # decided to delete the file. It is unlikely to come 

1796 # back and so we should still continue with the removal 

1797 # of the entry from the trash table. It is also possible 

1798 # we removed it in a previous iteration if it was 

1799 # a multi-dataset artifact. The delete artifact method 

1800 # will log a debug message in this scenario. 

1801 # Distinguishing file missing before trash started and 

1802 # file already removed previously as part of this trash 

1803 # is not worth the distinction with regards to potential 

1804 # memory cost. 

1805 pass 

1806 except Exception as e: 

1807 if ignore_errors: 

1808 # Use a debug message here even though it's not 

1809 # a good situation. In some cases this can be 

1810 # caused by a race between user A and user B 

1811 # and neither of them has permissions for the 

1812 # other's files. Butler does not know about users 

1813 # and trash has no idea what collections these 

1814 # files were in (without guessing from a path). 

1815 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

1816 location.uri, self.name, e) 

1817 else: 

1818 raise 

1819 

1820 @transactional 

1821 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef], 

1822 local_refs: Optional[Iterable[DatasetRef]] = None, 

1823 transfer: str = "auto") -> None: 

1824 # Docstring inherited 

1825 if type(self) is not type(source_datastore): 1825 ↛ 1826line 1825 didn't jump to line 1826, because the condition on line 1825 was never true

1826 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the " 

1827 f"source datastore ({type(source_datastore)}).") 

1828 

1829 # Be explicit for mypy 

1830 if not isinstance(source_datastore, FileDatastore): 1830 ↛ 1831line 1830 didn't jump to line 1831, because the condition on line 1830 was never true

1831 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not" 

1832 f" {type(source_datastore)}") 

1833 

1834 # Stop early if "direct" transfer mode is requested. That would 

1835 # require that the URI inside the source datastore should be stored 

1836 # directly in the target datastore, which seems unlikely to be useful 

1837 # since at any moment the source datastore could delete the file. 

1838 if transfer in ("direct", "split"): 1838 ↛ 1839line 1838 didn't jump to line 1839, because the condition on line 1838 was never true

1839 raise ValueError("Can not transfer from a source datastore using direct mode since" 

1840 " those files are controlled by the other datastore.") 

1841 

1842 # We will go through the list multiple times so must convert 

1843 # generators to lists. 

1844 refs = list(refs) 

1845 

1846 if local_refs is None: 1846 ↛ 1847line 1846 didn't jump to line 1847, because the condition on line 1846 was never true

1847 local_refs = refs 

1848 else: 

1849 local_refs = list(local_refs) 

1850 

1851 # In order to handle disassembled composites the code works 

1852 # at the records level since it can assume that internal APIs 

1853 # can be used. 

1854 # - If the record already exists in the destination this is assumed 

1855 # to be okay. 

1856 # - If there is no record but the source and destination URIs are 

1857 # identical no transfer is done but the record is added. 

1858 # - If the source record refers to an absolute URI currently assume 

1859 # that that URI should remain absolute and will be visible to the 

1860 # destination butler. May need to have a flag to indicate whether 

1861 # the dataset should be transferred. This will only happen if 

1862 # the detached Butler has had a local ingest. 

1863 

1864 # What we really want is all the records in the source datastore 

1865 # associated with these refs. Or derived ones if they don't exist 

1866 # in the source. 

1867 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

1868 

1869 # The source dataset_ids are the keys in these records 

1870 source_ids = set(source_records) 

1871 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

1872 

1873 # The not None check is to appease mypy 

1874 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

1875 missing_ids = requested_ids - source_ids 

1876 

1877 # Missing IDs can be okay if that datastore has allowed 

1878 # gets based on file existence. Should we transfer what we can 

1879 # or complain about it and warn? 

1880 if missing_ids and not source_datastore.trustGetRequest: 1880 ↛ 1881line 1880 didn't jump to line 1881, because the condition on line 1880 was never true

1881 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:" 

1882 f" {missing_ids}") 

1883 

1884 # Need to map these missing IDs to a DatasetRef so we can guess 

1885 # the details. 

1886 if missing_ids: 1886 ↛ 1887line 1886 didn't jump to line 1887, because the condition on line 1886 was never true

1887 log.info("Number of expected datasets missing from source datastore records: %d out of %d", 

1888 len(missing_ids), len(requested_ids)) 

1889 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

1890 

1891 for missing in missing_ids: 

1892 # Ask the source datastore where the missing artifacts 

1893 # should be. An execution butler might not know about the 

1894 # artifacts even if they are there. 

1895 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

1896 

1897 # Not all components can be guaranteed to exist so this 

1898 # list has to filter those by checking to see if the 

1899 # artifact is really there. 

1900 records = [info for location, info in expected if location.uri.exists()] 

1901 if records: 

1902 source_records[missing].extend(records) 

1903 else: 

1904 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", 

1905 id_to_ref[missing]) 

1906 

1907 # See if we already have these records 

1908 target_records = self._get_stored_records_associated_with_refs(local_refs) 

1909 

1910 # The artifacts to register 

1911 artifacts = [] 

1912 

1913 # Refs that already exist 

1914 already_present = [] 

1915 

1916 # Now can transfer the artifacts 

1917 for source_ref, target_ref in zip(refs, local_refs): 

1918 if target_ref.id in target_records: 1918 ↛ 1920line 1918 didn't jump to line 1920, because the condition on line 1918 was never true

1919 # Already have an artifact for this. 

1920 already_present.append(target_ref) 

1921 continue 

1922 

1923 # mypy needs to know these are always resolved refs 

1924 for info in source_records[source_ref.getCheckedId()]: 

1925 source_location = info.file_location(source_datastore.locationFactory) 

1926 target_location = info.file_location(self.locationFactory) 

1927 if source_location == target_location: 1927 ↛ 1931line 1927 didn't jump to line 1931, because the condition on line 1927 was never true

1928 # Either the dataset is already in the target datastore 

1929 # (which is how execution butler currently runs) or 

1930 # it is an absolute URI. 

1931 if source_location.pathInStore.isabs(): 

1932 # Just because we can see the artifact when running 

1933 # the transfer doesn't mean it will be generally 

1934 # accessible to a user of this butler. For now warn 

1935 # but assume it will be accessible. 

1936 log.warning("Transfer request for an outside-datastore artifact has been found at %s", 

1937 source_location) 

1938 else: 

1939 # Need to transfer it to the new location. 

1940 # Assume we should always overwrite. If the artifact 

1941 # is there this might indicate that a previous transfer 

1942 # was interrupted but was not able to be rolled back 

1943 # completely (eg pre-emption) so follow Datastore default 

1944 # and overwrite. 

1945 target_location.uri.transfer_from(source_location.uri, transfer=transfer, 

1946 overwrite=True, transaction=self._transaction) 

1947 

1948 artifacts.append((target_ref, info)) 

1949 

1950 self._register_datasets(artifacts) 

1951 

1952 if already_present: 1952 ↛ 1953line 1952 didn't jump to line 1953, because the condition on line 1952 was never true

1953 n_skipped = len(already_present) 

1954 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped, 

1955 "" if n_skipped == 1 else "s") 

1956 

1957 @transactional 

1958 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1959 # Docstring inherited. 

1960 refs = list(refs) 

1961 self.bridge.forget(refs) 

1962 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1963 

1964 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1965 logFailures: bool = False) -> None: 

1966 """Validate some of the configuration for this datastore. 

1967 

1968 Parameters 

1969 ---------- 

1970 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1971 Entities to test against this configuration. Can be differing 

1972 types. 

1973 logFailures : `bool`, optional 

1974 If `True`, output a log message for every validation error 

1975 detected. 

1976 

1977 Raises 

1978 ------ 

1979 DatastoreValidationError 

1980 Raised if there is a validation problem with a configuration. 

1981 All the problems are reported in a single exception. 

1982 

1983 Notes 

1984 ----- 

1985 This method checks that all the supplied entities have valid file 

1986 templates and also have formatters defined. 

1987 """ 

1988 

1989 templateFailed = None 

1990 try: 

1991 self.templates.validateTemplates(entities, logFailures=logFailures) 

1992 except FileTemplateValidationError as e: 

1993 templateFailed = str(e) 

1994 

1995 formatterFailed = [] 

1996 for entity in entities: 

1997 try: 

1998 self.formatterFactory.getFormatterClass(entity) 

1999 except KeyError as e: 

2000 formatterFailed.append(str(e)) 

2001 if logFailures: 2001 ↛ 1996line 2001 didn't jump to line 1996, because the condition on line 2001 was never false

2002 log.critical("Formatter failure: %s", e) 

2003 

2004 if templateFailed or formatterFailed: 

2005 messages = [] 

2006 if templateFailed: 2006 ↛ 2007line 2006 didn't jump to line 2007, because the condition on line 2006 was never true

2007 messages.append(templateFailed) 

2008 if formatterFailed: 2008 ↛ 2010line 2008 didn't jump to line 2010, because the condition on line 2008 was never false

2009 messages.append(",".join(formatterFailed)) 

2010 msg = ";\n".join(messages) 

2011 raise DatastoreValidationError(msg) 

2012 

2013 def getLookupKeys(self) -> Set[LookupKey]: 

2014 # Docstring is inherited from base class 

2015 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

2016 self.constraints.getLookupKeys() 

2017 

2018 def validateKey(self, lookupKey: LookupKey, 

2019 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2020 # Docstring is inherited from base class 

2021 # The key can be valid in either formatters or templates so we can 

2022 # only check the template if it exists 

2023 if lookupKey in self.templates: 

2024 try: 

2025 self.templates[lookupKey].validateTemplate(entity) 

2026 except FileTemplateValidationError as e: 

2027 raise DatastoreValidationError(e) from e 

2028 

2029 def export(self, refs: Iterable[DatasetRef], *, 

2030 directory: Optional[Union[ButlerURI, str]] = None, 

2031 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

2032 # Docstring inherited from Datastore.export. 

2033 if transfer is not None and directory is None: 2033 ↛ 2034line 2033 didn't jump to line 2034, because the condition on line 2033 was never true

2034 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

2035 "export directory given") 

2036 

2037 # Force the directory to be a URI object 

2038 directoryUri: Optional[ButlerURI] = None 

2039 if directory is not None: 2039 ↛ 2042line 2039 didn't jump to line 2042, because the condition on line 2039 was never false

2040 directoryUri = ButlerURI(directory, forceDirectory=True) 

2041 

2042 if transfer is not None and directoryUri is not None: 2042 ↛ 2047line 2042 didn't jump to line 2047, because the condition on line 2042 was never false

2043 # mypy needs the second test 

2044 if not directoryUri.exists(): 2044 ↛ 2045line 2044 didn't jump to line 2045, because the condition on line 2044 was never true

2045 raise FileNotFoundError(f"Export location {directory} does not exist") 

2046 

2047 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2048 for ref in progress.wrap(refs, "Exporting dataset files"): 

2049 fileLocations = self._get_dataset_locations_info(ref) 

2050 if not fileLocations: 2050 ↛ 2051line 2050 didn't jump to line 2051, because the condition on line 2050 was never true

2051 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2052 # For now we can not export disassembled datasets 

2053 if len(fileLocations) > 1: 2053 ↛ 2054line 2053 didn't jump to line 2054, because the condition on line 2053 was never true

2054 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2055 location, storedFileInfo = fileLocations[0] 

2056 

2057 pathInStore = location.pathInStore.path 

2058 if transfer is None: 2058 ↛ 2062line 2058 didn't jump to line 2062, because the condition on line 2058 was never true

2059 # TODO: do we also need to return the readStorageClass somehow? 

2060 # We will use the path in store directly. If this is an 

2061 # absolute URI, preserve it. 

2062 if location.pathInStore.isabs(): 

2063 pathInStore = str(location.uri) 

2064 elif transfer == "direct": 2064 ↛ 2066line 2064 didn't jump to line 2066, because the condition on line 2064 was never true

2065 # Use full URIs to the remote store in the export 

2066 pathInStore = str(location.uri) 

2067 else: 

2068 # mypy needs help 

2069 assert directoryUri is not None, "directoryUri must be defined to get here" 

2070 storeUri = ButlerURI(location.uri) 

2071 

2072 # if the datastore has an absolute URI to a resource, we 

2073 # have two options: 

2074 # 1. Keep the absolute URI in the exported YAML 

2075 # 2. Allocate a new name in the local datastore and transfer 

2076 # it. 

2077 # For now go with option 2 

2078 if location.pathInStore.isabs(): 2078 ↛ 2079line 2078 didn't jump to line 2079, because the condition on line 2078 was never true

2079 template = self.templates.getTemplate(ref) 

2080 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2081 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2082 

2083 exportUri = directoryUri.join(pathInStore) 

2084 exportUri.transfer_from(storeUri, transfer=transfer) 

2085 

2086 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2087 

2088 @staticmethod 

2089 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2090 """Compute the checksum of the supplied file. 

2091 

2092 Parameters 

2093 ---------- 

2094 uri : `ButlerURI` 

2095 Name of resource to calculate checksum from. 

2096 algorithm : `str`, optional 

2097 Name of algorithm to use. Must be one of the algorithms supported 

2098 by :py:class`hashlib`. 

2099 block_size : `int` 

2100 Number of bytes to read from file at one time. 

2101 

2102 Returns 

2103 ------- 

2104 hexdigest : `str` 

2105 Hex digest of the file. 

2106 

2107 Notes 

2108 ----- 

2109 Currently returns None if the URI is for a remote resource. 

2110 """ 

2111 if algorithm not in hashlib.algorithms_guaranteed: 2111 ↛ 2112line 2111 didn't jump to line 2112, because the condition on line 2111 was never true

2112 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2113 

2114 if not uri.isLocal: 2114 ↛ 2115line 2114 didn't jump to line 2115, because the condition on line 2114 was never true

2115 return None 

2116 

2117 hasher = hashlib.new(algorithm) 

2118 

2119 with uri.as_local() as local_uri: 

2120 with open(local_uri.ospath, "rb") as f: 

2121 for chunk in iter(lambda: f.read(block_size), b""): 

2122 hasher.update(chunk) 

2123 

2124 return hasher.hexdigest() 

2125 

2126 def needs_expanded_data_ids( 

2127 self, 

2128 transfer: Optional[str], 

2129 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2130 ) -> bool: 

2131 # Docstring inherited. 

2132 # This _could_ also use entity to inspect whether the filename template 

2133 # involves placeholders other than the required dimensions for its 

2134 # dataset type, but that's not necessary for correctness; it just 

2135 # enables more optimizations (perhaps only in theory). 

2136 return transfer not in ("direct", None)