Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from collections import defaultdict 

35from dataclasses import dataclass 

36from typing import ( 

37 TYPE_CHECKING, 

38 Any, 

39 ClassVar, 

40 Dict, 

41 Iterable, 

42 List, 

43 Mapping, 

44 Optional, 

45 Set, 

46 Tuple, 

47 Type, 

48 Union, 

49) 

50 

51from lsst.daf.butler import ( 

52 ButlerURI, 

53 CompositesMap, 

54 Config, 

55 FileDataset, 

56 DatasetId, 

57 DatasetRef, 

58 DatasetType, 

59 DatasetTypeNotSupportedError, 

60 Datastore, 

61 DatastoreCacheManager, 

62 DatastoreDisabledCacheManager, 

63 DatastoreConfig, 

64 DatastoreValidationError, 

65 FileDescriptor, 

66 FileTemplates, 

67 FileTemplateValidationError, 

68 Formatter, 

69 FormatterFactory, 

70 Location, 

71 LocationFactory, 

72 Progress, 

73 StorageClass, 

74 StoredFileInfo, 

75) 

76 

77from lsst.daf.butler import ddl 

78from lsst.daf.butler.registry.interfaces import ( 

79 ReadOnlyDatabaseError, 

80 DatastoreRegistryBridge, 

81) 

82 

83from lsst.daf.butler.core.repoRelocation import replaceRoot 

84from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

85from .genericDatastore import GenericBaseDatastore 

86 

87if TYPE_CHECKING: 87 ↛ 88line 87 didn't jump to line 88, because the condition on line 87 was never true

88 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

90 

91log = logging.getLogger(__name__) 

92 

93 

94class _IngestPrepData(Datastore.IngestPrepData): 

95 """Helper class for FileDatastore ingest implementation. 

96 

97 Parameters 

98 ---------- 

99 datasets : `list` of `FileDataset` 

100 Files to be ingested by this datastore. 

101 """ 

102 def __init__(self, datasets: List[FileDataset]): 

103 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

104 self.datasets = datasets 

105 

106 

107@dataclass(frozen=True) 

108class DatastoreFileGetInformation: 

109 """Collection of useful parameters needed to retrieve a file from 

110 a Datastore. 

111 """ 

112 

113 location: Location 

114 """The location from which to read the dataset.""" 

115 

116 formatter: Formatter 

117 """The `Formatter` to use to deserialize the dataset.""" 

118 

119 info: StoredFileInfo 

120 """Stored information about this file and its formatter.""" 

121 

122 assemblerParams: Dict[str, Any] 

123 """Parameters to use for post-processing the retrieved dataset.""" 

124 

125 formatterParams: Dict[str, Any] 

126 """Parameters that were understood by the associated formatter.""" 

127 

128 component: Optional[str] 

129 """The component to be retrieved (can be `None`).""" 

130 

131 readStorageClass: StorageClass 

132 """The `StorageClass` of the dataset being read.""" 

133 

134 

135class FileDatastore(GenericBaseDatastore): 

136 """Generic Datastore for file-based implementations. 

137 

138 Should always be sub-classed since key abstract methods are missing. 

139 

140 Parameters 

141 ---------- 

142 config : `DatastoreConfig` or `str` 

143 Configuration as either a `Config` object or URI to file. 

144 bridgeManager : `DatastoreRegistryBridgeManager` 

145 Object that manages the interface between `Registry` and datastores. 

146 butlerRoot : `str`, optional 

147 New datastore root to use to override the configuration value. 

148 

149 Raises 

150 ------ 

151 ValueError 

152 If root location does not exist and ``create`` is `False` in the 

153 configuration. 

154 """ 

155 

156 defaultConfigFile: ClassVar[Optional[str]] = None 

157 """Path to configuration defaults. Accessed within the ``config`` resource 

158 or relative to a search path. Can be None if no defaults specified. 

159 """ 

160 

161 root: ButlerURI 

162 """Root directory URI of this `Datastore`.""" 

163 

164 locationFactory: LocationFactory 

165 """Factory for creating locations relative to the datastore root.""" 

166 

167 formatterFactory: FormatterFactory 

168 """Factory for creating instances of formatters.""" 

169 

170 templates: FileTemplates 

171 """File templates that can be used by this `Datastore`.""" 

172 

173 composites: CompositesMap 

174 """Determines whether a dataset should be disassembled on put.""" 

175 

176 defaultConfigFile = "datastores/fileDatastore.yaml" 

177 """Path to configuration defaults. Accessed within the ``config`` resource 

178 or relative to a search path. Can be None if no defaults specified. 

179 """ 

180 

181 @classmethod 

182 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

183 """Set any filesystem-dependent config options for this Datastore to 

184 be appropriate for a new empty repository with the given root. 

185 

186 Parameters 

187 ---------- 

188 root : `str` 

189 URI to the root of the data repository. 

190 config : `Config` 

191 A `Config` to update. Only the subset understood by 

192 this component will be updated. Will not expand 

193 defaults. 

194 full : `Config` 

195 A complete config with all defaults expanded that can be 

196 converted to a `DatastoreConfig`. Read-only and will not be 

197 modified by this method. 

198 Repository-specific options that should not be obtained 

199 from defaults when Butler instances are constructed 

200 should be copied from ``full`` to ``config``. 

201 overwrite : `bool`, optional 

202 If `False`, do not modify a value in ``config`` if the value 

203 already exists. Default is always to overwrite with the provided 

204 ``root``. 

205 

206 Notes 

207 ----- 

208 If a keyword is explicitly defined in the supplied ``config`` it 

209 will not be overridden by this method if ``overwrite`` is `False`. 

210 This allows explicit values set in external configs to be retained. 

211 """ 

212 Config.updateParameters(DatastoreConfig, config, full, 

213 toUpdate={"root": root}, 

214 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

215 

216 @classmethod 

217 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

218 return ddl.TableSpec( 

219 fields=[ 

220 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

221 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

222 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

223 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

224 # Use empty string to indicate no component 

225 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

226 # TODO: should checksum be Base64Bytes instead? 

227 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

228 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

229 ], 

230 unique=frozenset(), 

231 indexes=[tuple(["path"])], 

232 ) 

233 

234 def __init__(self, config: Union[DatastoreConfig, str], 

235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

236 super().__init__(config, bridgeManager) 

237 if "root" not in self.config: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true

238 raise ValueError("No root directory specified in configuration") 

239 

240 # Name ourselves either using an explicit name or a name 

241 # derived from the (unexpanded) root 

242 if "name" in self.config: 

243 self.name = self.config["name"] 

244 else: 

245 # We use the unexpanded root in the name to indicate that this 

246 # datastore can be moved without having to update registry. 

247 self.name = "{}@{}".format(type(self).__name__, 

248 self.config["root"]) 

249 

250 # Support repository relocation in config 

251 # Existence of self.root is checked in subclass 

252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

253 forceDirectory=True, forceAbsolute=True) 

254 

255 self.locationFactory = LocationFactory(self.root) 

256 self.formatterFactory = FormatterFactory() 

257 

258 # Now associate formatters with storage classes 

259 self.formatterFactory.registerFormatters(self.config["formatters"], 

260 universe=bridgeManager.universe) 

261 

262 # Read the file naming templates 

263 self.templates = FileTemplates(self.config["templates"], 

264 universe=bridgeManager.universe) 

265 

266 # See if composites should be disassembled 

267 self.composites = CompositesMap(self.config["composites"], 

268 universe=bridgeManager.universe) 

269 

270 tableName = self.config["records", "table"] 

271 try: 

272 # Storage of paths and formatters, keyed by dataset_id 

273 self._table = bridgeManager.opaque.register( 

274 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType)) 

275 # Interface to Registry. 

276 self._bridge = bridgeManager.register(self.name) 

277 except ReadOnlyDatabaseError: 

278 # If the database is read only and we just tried and failed to 

279 # create a table, it means someone is trying to create a read-only 

280 # butler client for an empty repo. That should be okay, as long 

281 # as they then try to get any datasets before some other client 

282 # creates the table. Chances are they'rejust validating 

283 # configuration. 

284 pass 

285 

286 # Determine whether checksums should be used - default to False 

287 self.useChecksum = self.config.get("checksum", False) 

288 

289 # Determine whether we can fall back to configuration if a 

290 # requested dataset is not known to registry 

291 self.trustGetRequest = self.config.get("trust_get_request", False) 

292 

293 # Create a cache manager 

294 self.cacheManager: AbstractDatastoreCacheManager 

295 if "cached" in self.config: 295 ↛ 299line 295 didn't jump to line 299, because the condition on line 295 was never false

296 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

297 universe=bridgeManager.universe) 

298 else: 

299 self.cacheManager = DatastoreDisabledCacheManager("", 

300 universe=bridgeManager.universe) 

301 

302 # Check existence and create directory structure if necessary 

303 if not self.root.exists(): 

304 if "create" not in self.config or not self.config["create"]: 304 ↛ 305line 304 didn't jump to line 305, because the condition on line 304 was never true

305 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

306 try: 

307 self.root.mkdir() 

308 except Exception as e: 

309 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

310 f" Got error: {e}") from e 

311 

312 def __str__(self) -> str: 

313 return str(self.root) 

314 

315 @property 

316 def bridge(self) -> DatastoreRegistryBridge: 

317 return self._bridge 

318 

319 def _artifact_exists(self, location: Location) -> bool: 

320 """Check that an artifact exists in this datastore at the specified 

321 location. 

322 

323 Parameters 

324 ---------- 

325 location : `Location` 

326 Expected location of the artifact associated with this datastore. 

327 

328 Returns 

329 ------- 

330 exists : `bool` 

331 True if the location can be found, false otherwise. 

332 """ 

333 log.debug("Checking if resource exists: %s", location.uri) 

334 return location.uri.exists() 

335 

336 def _delete_artifact(self, location: Location) -> None: 

337 """Delete the artifact from the datastore. 

338 

339 Parameters 

340 ---------- 

341 location : `Location` 

342 Location of the artifact associated with this datastore. 

343 """ 

344 if location.pathInStore.isabs(): 344 ↛ 345line 344 didn't jump to line 345, because the condition on line 344 was never true

345 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

346 

347 try: 

348 location.uri.remove() 

349 except FileNotFoundError: 

350 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

351 raise 

352 except Exception as e: 

353 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

354 raise 

355 log.debug("Successfully deleted file: %s", location.uri) 

356 

357 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

358 # Docstring inherited from GenericBaseDatastore 

359 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

360 self._table.insert(*records) 

361 

362 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

363 # Docstring inherited from GenericBaseDatastore 

364 

365 # Look for the dataset_id -- there might be multiple matches 

366 # if we have disassembled the dataset. 

367 records = self._table.fetch(dataset_id=ref.id) 

368 return [StoredFileInfo.from_record(record) for record in records] 

369 

370 def _get_stored_records_associated_with_refs(self, 

371 refs: Iterable[DatasetIdRef] 

372 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

373 """Retrieve all records associated with the provided refs. 

374 

375 Parameters 

376 ---------- 

377 refs : iterable of `DatasetIdRef` 

378 The refs for which records are to be retrieved. 

379 

380 Returns 

381 ------- 

382 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

383 The matching records indexed by the ref ID. The number of entries 

384 in the dict can be smaller than the number of requested refs. 

385 """ 

386 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

387 

388 # Uniqueness is dataset_id + component so can have multiple records 

389 # per ref. 

390 records_by_ref = defaultdict(list) 

391 for record in records: 

392 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

393 return records_by_ref 

394 

395 def _refs_associated_with_artifacts(self, paths: List[Union[str, ButlerURI]]) -> Dict[str, 

396 Set[DatasetId]]: 

397 """Return paths and associated dataset refs. 

398 

399 Parameters 

400 ---------- 

401 paths : `list` of `str` or `ButlerURI` 

402 All the paths to include in search. 

403 

404 Returns 

405 ------- 

406 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

407 Mapping of each path to a set of associated database IDs. 

408 """ 

409 records = self._table.fetch(path=[str(path) for path in paths]) 

410 result = defaultdict(set) 

411 for row in records: 

412 result[row["path"]].add(row["dataset_id"]) 

413 return result 

414 

415 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[DatasetId]: 

416 """Return all dataset refs associated with the supplied path. 

417 

418 Parameters 

419 ---------- 

420 pathInStore : `ButlerURI` 

421 Path of interest in the data store. 

422 

423 Returns 

424 ------- 

425 ids : `set` of `int` 

426 All `DatasetRef` IDs associated with this path. 

427 """ 

428 records = list(self._table.fetch(path=str(pathInStore))) 

429 ids = {r["dataset_id"] for r in records} 

430 return ids 

431 

432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

433 # Docstring inherited from GenericBaseDatastore 

434 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

435 

436 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

437 r"""Find all the `Location`\ s of the requested dataset in the 

438 `Datastore` and the associated stored file information. 

439 

440 Parameters 

441 ---------- 

442 ref : `DatasetRef` 

443 Reference to the required `Dataset`. 

444 

445 Returns 

446 ------- 

447 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

448 Location of the dataset within the datastore and 

449 stored information about each file and its formatter. 

450 """ 

451 # Get the file information (this will fail if no file) 

452 records = self.getStoredItemsInfo(ref) 

453 

454 # Use the path to determine the location -- we need to take 

455 # into account absolute URIs in the datastore record 

456 return [(r.file_location(self.locationFactory), r) for r in records] 

457 

458 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

459 """Check that there is only one dataset associated with the 

460 specified artifact. 

461 

462 Parameters 

463 ---------- 

464 ref : `DatasetRef` or `FakeDatasetRef` 

465 Dataset to be removed. 

466 location : `Location` 

467 The location of the artifact to be removed. 

468 

469 Returns 

470 ------- 

471 can_remove : `Bool` 

472 True if the artifact can be safely removed. 

473 """ 

474 # Can't ever delete absolute URIs. 

475 if location.pathInStore.isabs(): 

476 return False 

477 

478 # Get all entries associated with this path 

479 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

480 if not allRefs: 

481 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

482 

483 # Remove these refs from all the refs and if there is nothing left 

484 # then we can delete 

485 remainingRefs = allRefs - {ref.id} 

486 

487 if remainingRefs: 

488 return False 

489 return True 

490 

491 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

492 StoredFileInfo]]: 

493 """Predict the location and related file information of the requested 

494 dataset in this datastore. 

495 

496 Parameters 

497 ---------- 

498 ref : `DatasetRef` 

499 Reference to the required `Dataset`. 

500 

501 Returns 

502 ------- 

503 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

504 Expected Location of the dataset within the datastore and 

505 placeholder information about each file and its formatter. 

506 

507 Notes 

508 ----- 

509 Uses the current configuration to determine how we would expect the 

510 datastore files to have been written if we couldn't ask registry. 

511 This is safe so long as there has been no change to datastore 

512 configuration between writing the dataset and wanting to read it. 

513 Will not work for files that have been ingested without using the 

514 standard file template or default formatter. 

515 """ 

516 

517 # If we have a component ref we always need to ask the questions 

518 # of the composite. If the composite is disassembled this routine 

519 # should return all components. If the composite was not 

520 # disassembled the composite is what is stored regardless of 

521 # component request. Note that if the caller has disassembled 

522 # a composite there is no way for this guess to know that 

523 # without trying both the composite and component ref and seeing 

524 # if there is something at the component Location even without 

525 # disassembly being enabled. 

526 if ref.datasetType.isComponent(): 

527 ref = ref.makeCompositeRef() 

528 

529 # See if the ref is a composite that should be disassembled 

530 doDisassembly = self.composites.shouldBeDisassembled(ref) 

531 

532 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

533 

534 if doDisassembly: 

535 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

536 compRef = ref.makeComponentRef(component) 

537 location, formatter = self._determine_put_formatter_location(compRef) 

538 all_info.append((location, formatter, componentStorage, component)) 

539 

540 else: 

541 # Always use the composite ref if no disassembly 

542 location, formatter = self._determine_put_formatter_location(ref) 

543 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

544 

545 # Convert the list of tuples to have StoredFileInfo as second element 

546 return [(location, StoredFileInfo(formatter=formatter, 

547 path=location.pathInStore.path, 

548 storageClass=storageClass, 

549 component=component, 

550 checksum=None, 

551 file_size=-1)) 

552 for location, formatter, storageClass, component in all_info] 

553 

554 def _prepare_for_get(self, ref: DatasetRef, 

555 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

556 """Check parameters for ``get`` and obtain formatter and 

557 location. 

558 

559 Parameters 

560 ---------- 

561 ref : `DatasetRef` 

562 Reference to the required Dataset. 

563 parameters : `dict` 

564 `StorageClass`-specific parameters that specify, for example, 

565 a slice of the dataset to be loaded. 

566 

567 Returns 

568 ------- 

569 getInfo : `list` [`DatastoreFileGetInformation`] 

570 Parameters needed to retrieve each file. 

571 """ 

572 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

573 

574 # Get file metadata and internal metadata 

575 fileLocations = self._get_dataset_locations_info(ref) 

576 if not fileLocations: 

577 if not self.trustGetRequest: 

578 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

579 # Assume the dataset is where we think it should be 

580 fileLocations = self._get_expected_dataset_locations_info(ref) 

581 

582 # The storage class we want to use eventually 

583 refStorageClass = ref.datasetType.storageClass 

584 

585 if len(fileLocations) > 1: 

586 disassembled = True 

587 else: 

588 disassembled = False 

589 

590 # Is this a component request? 

591 refComponent = ref.datasetType.component() 

592 

593 fileGetInfo = [] 

594 for location, storedFileInfo in fileLocations: 

595 

596 # The storage class used to write the file 

597 writeStorageClass = storedFileInfo.storageClass 

598 

599 # If this has been disassembled we need read to match the write 

600 if disassembled: 

601 readStorageClass = writeStorageClass 

602 else: 

603 readStorageClass = refStorageClass 

604 

605 formatter = getInstanceOf(storedFileInfo.formatter, 

606 FileDescriptor(location, readStorageClass=readStorageClass, 

607 storageClass=writeStorageClass, parameters=parameters), 

608 ref.dataId) 

609 

610 formatterParams, notFormatterParams = formatter.segregateParameters() 

611 

612 # Of the remaining parameters, extract the ones supported by 

613 # this StorageClass (for components not all will be handled) 

614 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

615 

616 # The ref itself could be a component if the dataset was 

617 # disassembled by butler, or we disassembled in datastore and 

618 # components came from the datastore records 

619 component = storedFileInfo.component if storedFileInfo.component else refComponent 

620 

621 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

622 assemblerParams, formatterParams, 

623 component, readStorageClass)) 

624 

625 return fileGetInfo 

626 

627 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

628 """Check the arguments for ``put`` and obtain formatter and 

629 location. 

630 

631 Parameters 

632 ---------- 

633 inMemoryDataset : `object` 

634 The dataset to store. 

635 ref : `DatasetRef` 

636 Reference to the associated Dataset. 

637 

638 Returns 

639 ------- 

640 location : `Location` 

641 The location to write the dataset. 

642 formatter : `Formatter` 

643 The `Formatter` to use to write the dataset. 

644 

645 Raises 

646 ------ 

647 TypeError 

648 Supplied object and storage class are inconsistent. 

649 DatasetTypeNotSupportedError 

650 The associated `DatasetType` is not handled by this datastore. 

651 """ 

652 self._validate_put_parameters(inMemoryDataset, ref) 

653 return self._determine_put_formatter_location(ref) 

654 

655 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

656 """Calculate the formatter and output location to use for put. 

657 

658 Parameters 

659 ---------- 

660 ref : `DatasetRef` 

661 Reference to the associated Dataset. 

662 

663 Returns 

664 ------- 

665 location : `Location` 

666 The location to write the dataset. 

667 formatter : `Formatter` 

668 The `Formatter` to use to write the dataset. 

669 """ 

670 # Work out output file name 

671 try: 

672 template = self.templates.getTemplate(ref) 

673 except KeyError as e: 

674 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

675 

676 # Validate the template to protect against filenames from different 

677 # dataIds returning the same and causing overwrite confusion. 

678 template.validateTemplate(ref) 

679 

680 location = self.locationFactory.fromPath(template.format(ref)) 

681 

682 # Get the formatter based on the storage class 

683 storageClass = ref.datasetType.storageClass 

684 try: 

685 formatter = self.formatterFactory.getFormatter(ref, 

686 FileDescriptor(location, 

687 storageClass=storageClass), 

688 ref.dataId) 

689 except KeyError as e: 

690 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

691 f"{self.name}") from e 

692 

693 # Now that we know the formatter, update the location 

694 location = formatter.makeUpdatedLocation(location) 

695 

696 return location, formatter 

697 

698 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

699 # Docstring inherited from base class 

700 if transfer != "auto": 

701 return transfer 

702 

703 # See if the paths are within the datastore or not 

704 inside = [self._pathInStore(d.path) is not None for d in datasets] 

705 

706 if all(inside): 

707 transfer = None 

708 elif not any(inside): 708 ↛ 712line 708 didn't jump to line 712, because the condition on line 708 was never false

709 # Allow ButlerURI to use its own knowledge 

710 transfer = "auto" 

711 else: 

712 raise ValueError("Some datasets are inside the datastore and some are outside." 

713 " Please use an explicit transfer mode and not 'auto'.") 

714 

715 return transfer 

716 

717 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

718 """Return path relative to datastore root 

719 

720 Parameters 

721 ---------- 

722 path : `str` or `ButlerURI` 

723 Path to dataset. Can be absolute URI. If relative assumed to 

724 be relative to the datastore. Returns path in datastore 

725 or raises an exception if the path it outside. 

726 

727 Returns 

728 ------- 

729 inStore : `str` 

730 Path relative to datastore root. Returns `None` if the file is 

731 outside the root. 

732 """ 

733 # Relative path will always be relative to datastore 

734 pathUri = ButlerURI(path, forceAbsolute=False) 

735 return pathUri.relative_to(self.root) 

736 

737 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

738 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

739 """Standardize the path of a to-be-ingested file. 

740 

741 Parameters 

742 ---------- 

743 path : `str` or `ButlerURI` 

744 Path of a file to be ingested. 

745 transfer : `str`, optional 

746 How (and whether) the dataset should be added to the datastore. 

747 See `ingest` for details of transfer modes. 

748 This implementation is provided only so 

749 `NotImplementedError` can be raised if the mode is not supported; 

750 actual transfers are deferred to `_extractIngestInfo`. 

751 

752 Returns 

753 ------- 

754 path : `str` or `ButlerURI` 

755 New path in what the datastore considers standard form. If an 

756 absolute URI was given that will be returned unchanged. 

757 

758 Notes 

759 ----- 

760 Subclasses of `FileDatastore` can implement this method instead 

761 of `_prepIngest`. It should not modify the data repository or given 

762 file in any way. 

763 

764 Raises 

765 ------ 

766 NotImplementedError 

767 Raised if the datastore does not support the given transfer mode 

768 (including the case where ingest is not supported at all). 

769 FileNotFoundError 

770 Raised if one of the given files does not exist. 

771 """ 

772 if transfer not in (None, "direct") + self.root.transferModes: 772 ↛ 773line 772 didn't jump to line 773, because the condition on line 772 was never true

773 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

774 

775 # A relative URI indicates relative to datastore root 

776 srcUri = ButlerURI(path, forceAbsolute=False) 

777 if not srcUri.isabs(): 

778 srcUri = self.root.join(path) 

779 

780 if not srcUri.exists(): 

781 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

782 f"are assumed to be relative to {self.root} unless they are absolute.") 

783 

784 if transfer is None: 

785 relpath = srcUri.relative_to(self.root) 

786 if not relpath: 

787 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

788 f"within datastore ({self.root})") 

789 

790 # Return the relative path within the datastore for internal 

791 # transfer 

792 path = relpath 

793 

794 return path 

795 

796 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

797 formatter: Union[Formatter, Type[Formatter]], 

798 transfer: Optional[str] = None) -> StoredFileInfo: 

799 """Relocate (if necessary) and extract `StoredFileInfo` from a 

800 to-be-ingested file. 

801 

802 Parameters 

803 ---------- 

804 path : `str` or `ButlerURI` 

805 URI or path of a file to be ingested. 

806 ref : `DatasetRef` 

807 Reference for the dataset being ingested. Guaranteed to have 

808 ``dataset_id not None`. 

809 formatter : `type` or `Formatter` 

810 `Formatter` subclass to use for this dataset or an instance. 

811 transfer : `str`, optional 

812 How (and whether) the dataset should be added to the datastore. 

813 See `ingest` for details of transfer modes. 

814 

815 Returns 

816 ------- 

817 info : `StoredFileInfo` 

818 Internal datastore record for this file. This will be inserted by 

819 the caller; the `_extractIngestInfo` is only resposible for 

820 creating and populating the struct. 

821 

822 Raises 

823 ------ 

824 FileNotFoundError 

825 Raised if one of the given files does not exist. 

826 FileExistsError 

827 Raised if transfer is not `None` but the (internal) location the 

828 file would be moved to is already occupied. 

829 """ 

830 if self._transaction is None: 830 ↛ 831line 830 didn't jump to line 831, because the condition on line 830 was never true

831 raise RuntimeError("Ingest called without transaction enabled") 

832 

833 # Create URI of the source path, do not need to force a relative 

834 # path to absolute. 

835 srcUri = ButlerURI(path, forceAbsolute=False) 

836 

837 # Track whether we have read the size of the source yet 

838 have_sized = False 

839 

840 tgtLocation: Optional[Location] 

841 if transfer is None: 

842 # A relative path is assumed to be relative to the datastore 

843 # in this context 

844 if not srcUri.isabs(): 

845 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

846 else: 

847 # Work out the path in the datastore from an absolute URI 

848 # This is required to be within the datastore. 

849 pathInStore = srcUri.relative_to(self.root) 

850 if pathInStore is None: 850 ↛ 851line 850 didn't jump to line 851, because the condition on line 850 was never true

851 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

852 f"not within datastore {self.root}") 

853 tgtLocation = self.locationFactory.fromPath(pathInStore) 

854 elif transfer == "direct": 854 ↛ 859line 854 didn't jump to line 859, because the condition on line 854 was never true

855 # Want to store the full URI to the resource directly in 

856 # datastore. This is useful for referring to permanent archive 

857 # storage for raw data. 

858 # Trust that people know what they are doing. 

859 tgtLocation = None 

860 else: 

861 # Work out the name we want this ingested file to have 

862 # inside the datastore 

863 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

864 if not tgtLocation.uri.dirname().exists(): 

865 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

866 tgtLocation.uri.dirname().mkdir() 

867 

868 # if we are transferring from a local file to a remote location 

869 # it may be more efficient to get the size and checksum of the 

870 # local file rather than the transferred one 

871 if not srcUri.scheme or srcUri.scheme == "file": 871 ↛ 877line 871 didn't jump to line 877, because the condition on line 871 was never false

872 size = srcUri.size() 

873 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

874 have_sized = True 

875 

876 # transfer the resource to the destination 

877 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

878 

879 if tgtLocation is None: 879 ↛ 881line 879 didn't jump to line 881, because the condition on line 879 was never true

880 # This means we are using direct mode 

881 targetUri = srcUri 

882 targetPath = str(srcUri) 

883 else: 

884 targetUri = tgtLocation.uri 

885 targetPath = tgtLocation.pathInStore.path 

886 

887 # the file should exist in the datastore now 

888 if not have_sized: 

889 size = targetUri.size() 

890 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

891 

892 return StoredFileInfo(formatter=formatter, path=targetPath, 

893 storageClass=ref.datasetType.storageClass, 

894 component=ref.datasetType.component(), 

895 file_size=size, checksum=checksum) 

896 

897 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

898 # Docstring inherited from Datastore._prepIngest. 

899 filtered = [] 

900 for dataset in datasets: 

901 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

902 if not acceptable: 

903 continue 

904 else: 

905 dataset.refs = acceptable 

906 if dataset.formatter is None: 

907 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

908 else: 

909 assert isinstance(dataset.formatter, (type, str)) 

910 dataset.formatter = getClassOf(dataset.formatter) 

911 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

912 filtered.append(dataset) 

913 return _IngestPrepData(filtered) 

914 

915 @transactional 

916 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

917 # Docstring inherited from Datastore._finishIngest. 

918 refsAndInfos = [] 

919 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

920 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

921 # Do ingest as if the first dataset ref is associated with the file 

922 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

923 transfer=transfer) 

924 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

925 self._register_datasets(refsAndInfos) 

926 

927 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

928 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

929 """Given a source URI and a DatasetRef, determine the name the 

930 dataset will have inside datastore. 

931 

932 Parameters 

933 ---------- 

934 srcUri : `ButlerURI` 

935 URI to the source dataset file. 

936 ref : `DatasetRef` 

937 Ref associated with the newly-ingested dataset artifact. This 

938 is used to determine the name within the datastore. 

939 formatter : `Formatter` or Formatter class. 

940 Formatter to use for validation. Can be a class or an instance. 

941 

942 Returns 

943 ------- 

944 location : `Location` 

945 Target location for the newly-ingested dataset. 

946 """ 

947 # Ingesting a file from outside the datastore. 

948 # This involves a new name. 

949 template = self.templates.getTemplate(ref) 

950 location = self.locationFactory.fromPath(template.format(ref)) 

951 

952 # Get the extension 

953 ext = srcUri.getExtension() 

954 

955 # Update the destination to include that extension 

956 location.updateExtension(ext) 

957 

958 # Ask the formatter to validate this extension 

959 formatter.validateExtension(location) 

960 

961 return location 

962 

963 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

964 """Write out in memory dataset to datastore. 

965 

966 Parameters 

967 ---------- 

968 inMemoryDataset : `object` 

969 Dataset to write to datastore. 

970 ref : `DatasetRef` 

971 Registry information associated with this dataset. 

972 

973 Returns 

974 ------- 

975 info : `StoredFileInfo` 

976 Information describin the artifact written to the datastore. 

977 """ 

978 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

979 uri = location.uri 

980 

981 if not uri.dirname().exists(): 

982 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

983 uri.dirname().mkdir() 

984 

985 if self._transaction is None: 985 ↛ 986line 985 didn't jump to line 986, because the condition on line 985 was never true

986 raise RuntimeError("Attempting to write artifact without transaction enabled") 

987 

988 def _removeFileExists(uri: ButlerURI) -> None: 

989 """Remove a file and do not complain if it is not there. 

990 

991 This is important since a formatter might fail before the file 

992 is written and we should not confuse people by writing spurious 

993 error messages to the log. 

994 """ 

995 try: 

996 uri.remove() 

997 except FileNotFoundError: 

998 pass 

999 

1000 # Register a callback to try to delete the uploaded data if 

1001 # something fails below 

1002 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1003 

1004 # For a local file, simply use the formatter directly 

1005 if uri.isLocal: 

1006 try: 

1007 formatter.write(inMemoryDataset) 

1008 except Exception as e: 

1009 raise RuntimeError(f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} " 

1010 f"to location {uri}") from e 

1011 log.debug("Successfully wrote python object to local file at %s", uri) 

1012 else: 

1013 # This is a remote URI, so first try bytes and write directly else 

1014 # fallback to a temporary file 

1015 try: 

1016 serializedDataset = formatter.toBytes(inMemoryDataset) 

1017 except NotImplementedError: 1017 ↛ 1036line 1017 didn't jump to line 1036

1018 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

1019 # Need to configure the formatter to write to a different 

1020 # location and that needs us to overwrite internals 

1021 tmpLocation = Location(*os.path.split(tmpFile.name)) 

1022 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

1023 with formatter._updateLocation(tmpLocation): 

1024 try: 

1025 formatter.write(inMemoryDataset) 

1026 except Exception as e: 

1027 raise RuntimeError(f"Failed to serialize dataset {ref} of type" 

1028 f" {type(inMemoryDataset)} to " 

1029 f"temporary location {tmpLocation.uri}") from e 

1030 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

1031 

1032 # Cache if required 

1033 self.cacheManager.move_to_cache(tmpLocation.uri, ref) 

1034 

1035 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1036 except Exception as e: 

1037 raise RuntimeError(f"Failed to serialize dataset {ref} to bytes.") from e 

1038 else: 

1039 log.debug("Writing bytes directly to %s", uri) 

1040 uri.write(serializedDataset, overwrite=True) 

1041 log.debug("Successfully wrote bytes directly to %s", uri) 

1042 

1043 # URI is needed to resolve what ingest case are we dealing with 

1044 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1045 

1046 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1047 ref: DatasetRef, isComponent: bool = False) -> Any: 

1048 """Read the artifact from datastore into in memory object. 

1049 

1050 Parameters 

1051 ---------- 

1052 getInfo : `DatastoreFileGetInformation` 

1053 Information about the artifact within the datastore. 

1054 ref : `DatasetRef` 

1055 The registry information associated with this artifact. 

1056 isComponent : `bool` 

1057 Flag to indicate if a component is being read from this artifact. 

1058 

1059 Returns 

1060 ------- 

1061 inMemoryDataset : `object` 

1062 The artifact as a python object. 

1063 """ 

1064 location = getInfo.location 

1065 uri = location.uri 

1066 log.debug("Accessing data from %s", uri) 

1067 

1068 # Cannot recalculate checksum but can compare size as a quick check 

1069 # Do not do this if the size is negative since that indicates 

1070 # we do not know. 

1071 recorded_size = getInfo.info.file_size 

1072 resource_size = uri.size() 

1073 if recorded_size >= 0 and resource_size != recorded_size: 1073 ↛ 1074line 1073 didn't jump to line 1074, because the condition on line 1073 was never true

1074 raise RuntimeError("Integrity failure in Datastore. " 

1075 f"Size of file {uri} ({resource_size}) " 

1076 f"does not match size recorded in registry of {recorded_size}") 

1077 

1078 # For the general case we have choices for how to proceed. 

1079 # 1. Always use a local file (downloading the remote resource to a 

1080 # temporary file if needed). 

1081 # 2. Use a threshold size and read into memory and use bytes. 

1082 # Use both for now with an arbitrary hand off size. 

1083 # This allows small datasets to be downloaded from remote object 

1084 # stores without requiring a temporary file. 

1085 

1086 formatter = getInfo.formatter 

1087 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1088 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1089 serializedDataset = uri.read() 

1090 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1091 f"component {getInfo.component}" if isComponent else "", 

1092 len(serializedDataset), uri, formatter.name()) 

1093 try: 

1094 result = formatter.fromBytes(serializedDataset, 

1095 component=getInfo.component if isComponent else None) 

1096 except Exception as e: 

1097 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1098 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1099 else: 

1100 # Read from file. 

1101 

1102 # Have to update the Location associated with the formatter 

1103 # because formatter.read does not allow an override. 

1104 # This could be improved. 

1105 location_updated = False 

1106 msg = "" 

1107 

1108 # First check in cache for local version. 

1109 # The cache will only be relevant for remote resources. 

1110 if not uri.isLocal: 

1111 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension()) 

1112 if cached_file is not None: 1112 ↛ 1113line 1112 didn't jump to line 1113, because the condition on line 1112 was never true

1113 msg = f"(via cache read of remote file {uri})" 

1114 uri = cached_file 

1115 location_updated = True 

1116 

1117 with uri.as_local() as local_uri: 

1118 

1119 # URI was remote and file was downloaded 

1120 if uri != local_uri: 

1121 cache_msg = "" 

1122 location_updated = True 

1123 

1124 # Cache the downloaded file if needed. 

1125 cached_uri = self.cacheManager.move_to_cache(local_uri, ref) 

1126 if cached_uri is not None: 1126 ↛ 1127line 1126 didn't jump to line 1127, because the condition on line 1126 was never true

1127 local_uri = cached_uri 

1128 cache_msg = " and cached" 

1129 

1130 msg = f"(via download to local file{cache_msg})" 

1131 

1132 # Calculate the (possibly) new location for the formatter 

1133 # to use. 

1134 newLocation = Location(*local_uri.split()) if location_updated else None 

1135 

1136 log.debug("Reading%s from location %s %s with formatter %s", 

1137 f" component {getInfo.component}" if isComponent else "", 

1138 uri, msg, formatter.name()) 

1139 try: 

1140 with formatter._updateLocation(newLocation): 

1141 result = formatter.read(component=getInfo.component if isComponent else None) 

1142 except Exception as e: 

1143 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1144 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1145 

1146 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1147 isComponent=isComponent) 

1148 

1149 def knows(self, ref: DatasetRef) -> bool: 

1150 """Check if the dataset is known to the datastore. 

1151 

1152 Does not check for existence of any artifact. 

1153 

1154 Parameters 

1155 ---------- 

1156 ref : `DatasetRef` 

1157 Reference to the required dataset. 

1158 

1159 Returns 

1160 ------- 

1161 exists : `bool` 

1162 `True` if the dataset is known to the datastore. 

1163 """ 

1164 fileLocations = self._get_dataset_locations_info(ref) 

1165 if fileLocations: 

1166 return True 

1167 return False 

1168 

1169 def exists(self, ref: DatasetRef) -> bool: 

1170 """Check if the dataset exists in the datastore. 

1171 

1172 Parameters 

1173 ---------- 

1174 ref : `DatasetRef` 

1175 Reference to the required dataset. 

1176 

1177 Returns 

1178 ------- 

1179 exists : `bool` 

1180 `True` if the entity exists in the `Datastore`. 

1181 """ 

1182 fileLocations = self._get_dataset_locations_info(ref) 

1183 

1184 # if we are being asked to trust that registry might not be correct 

1185 # we ask for the expected locations and check them explicitly 

1186 if not fileLocations: 

1187 if not self.trustGetRequest: 

1188 return False 

1189 fileLocations = self._get_expected_dataset_locations_info(ref) 

1190 for location, _ in fileLocations: 

1191 if not self._artifact_exists(location): 

1192 return False 

1193 

1194 return True 

1195 

1196 def getURIs(self, ref: DatasetRef, 

1197 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1198 """Return URIs associated with dataset. 

1199 

1200 Parameters 

1201 ---------- 

1202 ref : `DatasetRef` 

1203 Reference to the required dataset. 

1204 predict : `bool`, optional 

1205 If the datastore does not know about the dataset, should it 

1206 return a predicted URI or not? 

1207 

1208 Returns 

1209 ------- 

1210 primary : `ButlerURI` 

1211 The URI to the primary artifact associated with this dataset. 

1212 If the dataset was disassembled within the datastore this 

1213 may be `None`. 

1214 components : `dict` 

1215 URIs to any components associated with the dataset artifact. 

1216 Can be empty if there are no components. 

1217 """ 

1218 

1219 primary: Optional[ButlerURI] = None 

1220 components: Dict[str, ButlerURI] = {} 

1221 

1222 # if this has never been written then we have to guess 

1223 if not self.exists(ref): 

1224 if not predict: 

1225 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1226 

1227 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1228 

1229 if doDisassembly: 

1230 

1231 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1232 compRef = ref.makeComponentRef(component) 

1233 compLocation, _ = self._determine_put_formatter_location(compRef) 

1234 

1235 # Add a URI fragment to indicate this is a guess 

1236 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1237 

1238 else: 

1239 

1240 location, _ = self._determine_put_formatter_location(ref) 

1241 

1242 # Add a URI fragment to indicate this is a guess 

1243 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1244 

1245 return primary, components 

1246 

1247 # If this is a ref that we have written we can get the path. 

1248 # Get file metadata and internal metadata 

1249 fileLocations = self._get_dataset_locations_info(ref) 

1250 

1251 guessing = False 

1252 if not fileLocations: 

1253 if not self.trustGetRequest: 1253 ↛ 1254line 1253 didn't jump to line 1254, because the condition on line 1253 was never true

1254 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1255 fileLocations = self._get_expected_dataset_locations_info(ref) 

1256 guessing = True 

1257 

1258 if len(fileLocations) == 1: 

1259 # No disassembly so this is the primary URI 

1260 uri = fileLocations[0][0].uri 

1261 if guessing and not uri.exists(): 1261 ↛ 1262line 1261 didn't jump to line 1262, because the condition on line 1261 was never true

1262 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1263 primary = uri 

1264 

1265 else: 

1266 for location, storedFileInfo in fileLocations: 

1267 if storedFileInfo.component is None: 1267 ↛ 1268line 1267 didn't jump to line 1268, because the condition on line 1267 was never true

1268 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1269 uri = location.uri 

1270 if guessing and not uri.exists(): 1270 ↛ 1271line 1270 didn't jump to line 1271, because the condition on line 1270 was never true

1271 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1272 components[storedFileInfo.component] = uri 

1273 

1274 return primary, components 

1275 

1276 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1277 """URI to the Dataset. 

1278 

1279 Parameters 

1280 ---------- 

1281 ref : `DatasetRef` 

1282 Reference to the required Dataset. 

1283 predict : `bool` 

1284 If `True`, allow URIs to be returned of datasets that have not 

1285 been written. 

1286 

1287 Returns 

1288 ------- 

1289 uri : `str` 

1290 URI pointing to the dataset within the datastore. If the 

1291 dataset does not exist in the datastore, and if ``predict`` is 

1292 `True`, the URI will be a prediction and will include a URI 

1293 fragment "#predicted". 

1294 If the datastore does not have entities that relate well 

1295 to the concept of a URI the returned URI will be 

1296 descriptive. The returned URI is not guaranteed to be obtainable. 

1297 

1298 Raises 

1299 ------ 

1300 FileNotFoundError 

1301 Raised if a URI has been requested for a dataset that does not 

1302 exist and guessing is not allowed. 

1303 RuntimeError 

1304 Raised if a request is made for a single URI but multiple URIs 

1305 are associated with this dataset. 

1306 

1307 Notes 

1308 ----- 

1309 When a predicted URI is requested an attempt will be made to form 

1310 a reasonable URI based on file templates and the expected formatter. 

1311 """ 

1312 primary, components = self.getURIs(ref, predict) 

1313 if primary is None or components: 1313 ↛ 1314line 1313 didn't jump to line 1314, because the condition on line 1313 was never true

1314 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1315 "Use Dataastore.getURIs() instead.") 

1316 return primary 

1317 

1318 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1319 destination: ButlerURI, transfer: str = "auto", 

1320 preserve_path: bool = True, 

1321 overwrite: bool = False) -> List[ButlerURI]: 

1322 """Retrieve the file artifacts associated with the supplied refs. 

1323 

1324 Parameters 

1325 ---------- 

1326 refs : iterable of `DatasetRef` 

1327 The datasets for which file artifacts are to be retrieved. 

1328 A single ref can result in multiple files. The refs must 

1329 be resolved. 

1330 destination : `ButlerURI` 

1331 Location to write the file artifacts. 

1332 transfer : `str`, optional 

1333 Method to use to transfer the artifacts. Must be one of the options 

1334 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1335 preserve_path : `bool`, optional 

1336 If `True` the full path of the file artifact within the datastore 

1337 is preserved. If `False` the final file component of the path 

1338 is used. 

1339 overwrite : `bool`, optional 

1340 If `True` allow transfers to overwrite existing files at the 

1341 destination. 

1342 

1343 Returns 

1344 ------- 

1345 targets : `list` of `ButlerURI` 

1346 URIs of file artifacts in destination location. Order is not 

1347 preserved. 

1348 """ 

1349 if not destination.isdir(): 1349 ↛ 1350line 1349 didn't jump to line 1350, because the condition on line 1349 was never true

1350 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1351 

1352 if transfer == "move": 

1353 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1354 

1355 # Source -> Destination 

1356 # This also helps filter out duplicate DatasetRef in the request 

1357 # that will map to the same underlying file transfer. 

1358 to_transfer: Dict[ButlerURI, ButlerURI] = {} 

1359 

1360 for ref in refs: 

1361 locations = self._get_dataset_locations_info(ref) 

1362 for location, _ in locations: 

1363 source_uri = location.uri 

1364 target_path: Union[str, ButlerURI] 

1365 if preserve_path: 

1366 target_path = location.pathInStore 

1367 if target_path.isabs(): 1367 ↛ 1370line 1367 didn't jump to line 1370, because the condition on line 1367 was never true

1368 # This is an absolute path to an external file. 

1369 # Use the full path. 

1370 target_path = target_path.relativeToPathRoot 

1371 else: 

1372 target_path = source_uri.basename() 

1373 target_uri = destination.join(target_path) 

1374 to_transfer[source_uri] = target_uri 

1375 

1376 # In theory can now parallelize the transfer 

1377 log.debug("Number of artifacts to transfer to %s: %d", 

1378 str(destination), len(to_transfer)) 

1379 for source_uri, target_uri in to_transfer.items(): 

1380 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1381 

1382 return list(to_transfer.values()) 

1383 

1384 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1385 """Load an InMemoryDataset from the store. 

1386 

1387 Parameters 

1388 ---------- 

1389 ref : `DatasetRef` 

1390 Reference to the required Dataset. 

1391 parameters : `dict` 

1392 `StorageClass`-specific parameters that specify, for example, 

1393 a slice of the dataset to be loaded. 

1394 

1395 Returns 

1396 ------- 

1397 inMemoryDataset : `object` 

1398 Requested dataset or slice thereof as an InMemoryDataset. 

1399 

1400 Raises 

1401 ------ 

1402 FileNotFoundError 

1403 Requested dataset can not be retrieved. 

1404 TypeError 

1405 Return value from formatter has unexpected type. 

1406 ValueError 

1407 Formatter failed to process the dataset. 

1408 """ 

1409 allGetInfo = self._prepare_for_get(ref, parameters) 

1410 refComponent = ref.datasetType.component() 

1411 

1412 # Supplied storage class for the component being read 

1413 refStorageClass = ref.datasetType.storageClass 

1414 

1415 # Create mapping from component name to related info 

1416 allComponents = {i.component: i for i in allGetInfo} 

1417 

1418 # By definition the dataset is disassembled if we have more 

1419 # than one record for it. 

1420 isDisassembled = len(allGetInfo) > 1 

1421 

1422 # Look for the special case where we are disassembled but the 

1423 # component is a derived component that was not written during 

1424 # disassembly. For this scenario we need to check that the 

1425 # component requested is listed as a derived component for the 

1426 # composite storage class 

1427 isDisassembledReadOnlyComponent = False 

1428 if isDisassembled and refComponent: 

1429 # The composite storage class should be accessible through 

1430 # the component dataset type 

1431 compositeStorageClass = ref.datasetType.parentStorageClass 

1432 

1433 # In the unlikely scenario where the composite storage 

1434 # class is not known, we can only assume that this is a 

1435 # normal component. If that assumption is wrong then the 

1436 # branch below that reads a persisted component will fail 

1437 # so there is no need to complain here. 

1438 if compositeStorageClass is not None: 1438 ↛ 1441line 1438 didn't jump to line 1441, because the condition on line 1438 was never false

1439 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1440 

1441 if isDisassembled and not refComponent: 

1442 # This was a disassembled dataset spread over multiple files 

1443 # and we need to put them all back together again. 

1444 # Read into memory and then assemble 

1445 

1446 # Check that the supplied parameters are suitable for the type read 

1447 refStorageClass.validateParameters(parameters) 

1448 

1449 # We want to keep track of all the parameters that were not used 

1450 # by formatters. We assume that if any of the component formatters 

1451 # use a parameter that we do not need to apply it again in the 

1452 # assembler. 

1453 usedParams = set() 

1454 

1455 components: Dict[str, Any] = {} 

1456 for getInfo in allGetInfo: 

1457 # assemblerParams are parameters not understood by the 

1458 # associated formatter. 

1459 usedParams.update(set(getInfo.formatterParams)) 

1460 

1461 component = getInfo.component 

1462 

1463 if component is None: 1463 ↛ 1464line 1463 didn't jump to line 1464, because the condition on line 1463 was never true

1464 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1465 

1466 # We do not want the formatter to think it's reading 

1467 # a component though because it is really reading a 

1468 # standalone dataset -- always tell reader it is not a 

1469 # component. 

1470 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1471 

1472 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1473 

1474 # Any unused parameters will have to be passed to the assembler 

1475 if parameters: 

1476 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1477 else: 

1478 unusedParams = {} 

1479 

1480 # Process parameters 

1481 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1482 parameters=unusedParams) 

1483 

1484 elif isDisassembledReadOnlyComponent: 

1485 

1486 compositeStorageClass = ref.datasetType.parentStorageClass 

1487 if compositeStorageClass is None: 1487 ↛ 1488line 1487 didn't jump to line 1488, because the condition on line 1487 was never true

1488 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1489 "no composite storage class is available.") 

1490 

1491 if refComponent is None: 1491 ↛ 1493line 1491 didn't jump to line 1493, because the condition on line 1491 was never true

1492 # Mainly for mypy 

1493 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1494 

1495 # Assume that every derived component can be calculated by 

1496 # forwarding the request to a single read/write component. 

1497 # Rather than guessing which rw component is the right one by 

1498 # scanning each for a derived component of the same name, 

1499 # we ask the storage class delegate directly which one is best to 

1500 # use. 

1501 compositeDelegate = compositeStorageClass.delegate() 

1502 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1503 set(allComponents)) 

1504 

1505 # Select the relevant component 

1506 rwInfo = allComponents[forwardedComponent] 

1507 

1508 # For now assume that read parameters are validated against 

1509 # the real component and not the requested component 

1510 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1511 forwardedStorageClass.validateParameters(parameters) 

1512 

1513 # Unfortunately the FileDescriptor inside the formatter will have 

1514 # the wrong write storage class so we need to create a new one 

1515 # given the immutability constraint. 

1516 writeStorageClass = rwInfo.info.storageClass 

1517 

1518 # We may need to put some thought into parameters for read 

1519 # components but for now forward them on as is 

1520 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1521 readStorageClass=refStorageClass, 

1522 storageClass=writeStorageClass, 

1523 parameters=parameters), 

1524 ref.dataId) 

1525 

1526 # The assembler can not receive any parameter requests for a 

1527 # derived component at this time since the assembler will 

1528 # see the storage class of the derived component and those 

1529 # parameters will have to be handled by the formatter on the 

1530 # forwarded storage class. 

1531 assemblerParams: Dict[str, Any] = {} 

1532 

1533 # Need to created a new info that specifies the derived 

1534 # component and associated storage class 

1535 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1536 rwInfo.info, assemblerParams, {}, 

1537 refComponent, refStorageClass) 

1538 

1539 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1540 

1541 else: 

1542 # Single file request or component from that composite file 

1543 for lookup in (refComponent, None): 1543 ↛ 1548line 1543 didn't jump to line 1548, because the loop on line 1543 didn't complete

1544 if lookup in allComponents: 1544 ↛ 1543line 1544 didn't jump to line 1543, because the condition on line 1544 was never false

1545 getInfo = allComponents[lookup] 

1546 break 

1547 else: 

1548 raise FileNotFoundError(f"Component {refComponent} not found " 

1549 f"for ref {ref} in datastore {self.name}") 

1550 

1551 # Do not need the component itself if already disassembled 

1552 if isDisassembled: 

1553 isComponent = False 

1554 else: 

1555 isComponent = getInfo.component is not None 

1556 

1557 # For a disassembled component we can validate parametersagainst 

1558 # the component storage class directly 

1559 if isDisassembled: 

1560 refStorageClass.validateParameters(parameters) 

1561 else: 

1562 # For an assembled composite this could be a derived 

1563 # component derived from a real component. The validity 

1564 # of the parameters is not clear. For now validate against 

1565 # the composite storage class 

1566 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1567 

1568 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1569 

1570 @transactional 

1571 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1572 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1573 

1574 Parameters 

1575 ---------- 

1576 inMemoryDataset : `object` 

1577 The dataset to store. 

1578 ref : `DatasetRef` 

1579 Reference to the associated Dataset. 

1580 

1581 Raises 

1582 ------ 

1583 TypeError 

1584 Supplied object and storage class are inconsistent. 

1585 DatasetTypeNotSupportedError 

1586 The associated `DatasetType` is not handled by this datastore. 

1587 

1588 Notes 

1589 ----- 

1590 If the datastore is configured to reject certain dataset types it 

1591 is possible that the put will fail and raise a 

1592 `DatasetTypeNotSupportedError`. The main use case for this is to 

1593 allow `ChainedDatastore` to put to multiple datastores without 

1594 requiring that every datastore accepts the dataset. 

1595 """ 

1596 

1597 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1598 # doDisassembly = True 

1599 

1600 artifacts = [] 

1601 if doDisassembly: 

1602 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1603 for component, componentInfo in components.items(): 

1604 # Don't recurse because we want to take advantage of 

1605 # bulk insert -- need a new DatasetRef that refers to the 

1606 # same dataset_id but has the component DatasetType 

1607 # DatasetType does not refer to the types of components 

1608 # So we construct one ourselves. 

1609 compRef = ref.makeComponentRef(component) 

1610 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1611 artifacts.append((compRef, storedInfo)) 

1612 else: 

1613 # Write the entire thing out 

1614 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1615 artifacts.append((ref, storedInfo)) 

1616 

1617 self._register_datasets(artifacts) 

1618 

1619 @transactional 

1620 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

1621 # Get file metadata and internal metadata 

1622 if not isinstance(ref, DatasetRef): 

1623 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

1624 # Assumed to be an iterable of refs so bulk mode enabled. 

1625 try: 

1626 self.bridge.moveToTrash(ref) 

1627 except Exception as e: 

1628 if ignore_errors: 

1629 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

1630 else: 

1631 raise 

1632 return 

1633 

1634 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

1635 

1636 fileLocations = self._get_dataset_locations_info(ref) 

1637 

1638 if not fileLocations: 

1639 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1640 if ignore_errors: 1640 ↛ 1641line 1640 didn't jump to line 1641, because the condition on line 1640 was never true

1641 log.warning(err_msg) 

1642 return 

1643 else: 

1644 raise FileNotFoundError(err_msg) 

1645 

1646 for location, storedFileInfo in fileLocations: 

1647 if not self._artifact_exists(location): 1647 ↛ 1648line 1647 didn't jump to line 1648, because the condition on line 1647 was never true

1648 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1649 f"associated artifact ({location.uri}) is missing" 

1650 if ignore_errors: 

1651 log.warning(err_msg) 

1652 return 

1653 else: 

1654 raise FileNotFoundError(err_msg) 

1655 

1656 # Mark dataset as trashed 

1657 try: 

1658 self.bridge.moveToTrash([ref]) 

1659 except Exception as e: 

1660 if ignore_errors: 

1661 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1662 f"but encountered an error: {e}") 

1663 pass 

1664 else: 

1665 raise 

1666 

1667 @transactional 

1668 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1669 """Remove all datasets from the trash. 

1670 

1671 Parameters 

1672 ---------- 

1673 ignore_errors : `bool` 

1674 If `True` return without error even if something went wrong. 

1675 Problems could occur if another process is simultaneously trying 

1676 to delete. 

1677 """ 

1678 log.debug("Emptying trash in datastore %s", self.name) 

1679 

1680 # Context manager will empty trash iff we finish it without raising. 

1681 # It will also automatically delete the relevant rows from the 

1682 # trash table and the records table. 

1683 with self.bridge.emptyTrash(self._table, record_class=StoredFileInfo, 

1684 record_column="path") as trash_data: 

1685 # Removing the artifacts themselves requires that the files are 

1686 # not also associated with refs that are not to be trashed. 

1687 # Therefore need to do a query with the file paths themselves 

1688 # and return all the refs associated with them. Can only delete 

1689 # a file if the refs to be trashed are the only refs associated 

1690 # with the file. 

1691 # This requires multiple copies of the trashed items 

1692 trashed, artifacts_to_keep = trash_data 

1693 

1694 if artifacts_to_keep is None: 

1695 # The bridge is not helping us so have to work it out 

1696 # ourselves. This is not going to be as efficient. 

1697 trashed = list(trashed) 

1698 

1699 # The instance check is for mypy since up to this point it 

1700 # does not know the type of info. 

1701 path_map = self._refs_associated_with_artifacts([info.path for _, info in trashed 

1702 if isinstance(info, StoredFileInfo)]) 

1703 

1704 for ref, info in trashed: 

1705 

1706 # Mypy needs to know this is not the base class 

1707 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1708 

1709 # Check for mypy 

1710 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1711 

1712 path_map[info.path].remove(ref.id) 

1713 if not path_map[info.path]: 1713 ↛ 1704line 1713 didn't jump to line 1704, because the condition on line 1713 was never false

1714 del path_map[info.path] 

1715 

1716 artifacts_to_keep = set(path_map) 

1717 

1718 for ref, info in trashed: 

1719 

1720 # Should not happen for this implementation but need 

1721 # to keep mypy happy. 

1722 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

1723 

1724 # Mypy needs to know this is not the base class 

1725 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

1726 

1727 # Check for mypy 

1728 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

1729 

1730 if info.path in artifacts_to_keep: 

1731 # This is a multi-dataset artifact and we are not 

1732 # removing all associated refs. 

1733 continue 

1734 

1735 # Only trashed refs still known to datastore will be returned. 

1736 location = info.file_location(self.locationFactory) 

1737 

1738 # Point of no return for this artifact 

1739 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1740 try: 

1741 self._delete_artifact(location) 

1742 except FileNotFoundError: 

1743 # If the file itself has been deleted there is nothing 

1744 # we can do about it. It is possible that trash has 

1745 # been run in parallel in another process or someone 

1746 # decided to delete the file. It is unlikely to come 

1747 # back and so we should still continue with the removal 

1748 # of the entry from the trash table. It is also possible 

1749 # we removed it in a previous iteration if it was 

1750 # a multi-dataset artifact. The delete artifact method 

1751 # will log a debug message in this scenario. 

1752 # Distinguishing file missing before trash started and 

1753 # file already removed previously as part of this trash 

1754 # is not worth the distinction with regards to potential 

1755 # memory cost. 

1756 pass 

1757 except Exception as e: 

1758 if ignore_errors: 

1759 # Use a debug message here even though it's not 

1760 # a good situation. In some cases this can be 

1761 # caused by a race between user A and user B 

1762 # and neither of them has permissions for the 

1763 # other's files. Butler does not know about users 

1764 # and trash has no idea what collections these 

1765 # files were in (without guessing from a path). 

1766 log.debug("Encountered error removing artifact %s from datastore %s: %s", 

1767 location.uri, self.name, e) 

1768 else: 

1769 raise 

1770 

1771 @transactional 

1772 def transfer_from(self, source_datastore: Datastore, refs: Iterable[DatasetRef], 

1773 local_refs: Optional[Iterable[DatasetRef]] = None, 

1774 transfer: str = "auto") -> None: 

1775 # Docstring inherited 

1776 if type(self) is not type(source_datastore): 1776 ↛ 1777line 1776 didn't jump to line 1777, because the condition on line 1776 was never true

1777 raise TypeError(f"Datastore mismatch between this datastore ({type(self)}) and the " 

1778 f"source datastore ({type(source_datastore)}).") 

1779 

1780 # Be explicit for mypy 

1781 if not isinstance(source_datastore, FileDatastore): 1781 ↛ 1782line 1781 didn't jump to line 1782, because the condition on line 1781 was never true

1782 raise TypeError("Can only transfer to a FileDatastore from another FileDatastore, not" 

1783 f" {type(source_datastore)}") 

1784 

1785 # Stop early if "direct" transfer mode is requested. That would 

1786 # require that the URI inside the source datastore should be stored 

1787 # directly in the target datastore, which seems unlikely to be useful 

1788 # since at any moment the source datastore could delete the file. 

1789 if transfer == "direct": 1789 ↛ 1790line 1789 didn't jump to line 1790, because the condition on line 1789 was never true

1790 raise ValueError("Can not transfer from a source datastore using direct mode since" 

1791 " those files are controlled by the other datastore.") 

1792 

1793 # We will go through the list multiple times so must convert 

1794 # generators to lists. 

1795 refs = list(refs) 

1796 

1797 if local_refs is None: 1797 ↛ 1798line 1797 didn't jump to line 1798, because the condition on line 1797 was never true

1798 local_refs = refs 

1799 else: 

1800 local_refs = list(local_refs) 

1801 

1802 # In order to handle disassembled composites the code works 

1803 # at the records level since it can assume that internal APIs 

1804 # can be used. 

1805 # - If the record already exists in the destination this is assumed 

1806 # to be okay. 

1807 # - If there is no record but the source and destination URIs are 

1808 # identical no transfer is done but the record is added. 

1809 # - If the source record refers to an absolute URI currently assume 

1810 # that that URI should remain absolute and will be visible to the 

1811 # destination butler. May need to have a flag to indicate whether 

1812 # the dataset should be transferred. This will only happen if 

1813 # the detached Butler has had a local ingest. 

1814 

1815 # What we really want is all the records in the source datastore 

1816 # associated with these refs. Or derived ones if they don't exist 

1817 # in the source. 

1818 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

1819 

1820 # The source dataset_ids are the keys in these records 

1821 source_ids = set(source_records) 

1822 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

1823 

1824 # The not None check is to appease mypy 

1825 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

1826 missing_ids = requested_ids - source_ids 

1827 

1828 # Missing IDs can be okay if that datastore has allowed 

1829 # gets based on file existence. Should we transfer what we can 

1830 # or complain about it and warn? 

1831 if missing_ids and not source_datastore.trustGetRequest: 1831 ↛ 1832line 1831 didn't jump to line 1832, because the condition on line 1831 was never true

1832 raise ValueError(f"Some datasets are missing from source datastore {source_datastore}:" 

1833 f" {missing_ids}") 

1834 

1835 # Need to map these missing IDs to a DatasetRef so we can guess 

1836 # the details. 

1837 if missing_ids: 1837 ↛ 1838line 1837 didn't jump to line 1838, because the condition on line 1837 was never true

1838 log.info("Number of expected datasets missing from source datastore records: %d", 

1839 len(missing_ids)) 

1840 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

1841 

1842 for missing in missing_ids: 

1843 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1844 source_records[missing].extend(info for _, info in expected) 

1845 

1846 # See if we already have these records 

1847 target_records = self._get_stored_records_associated_with_refs(local_refs) 

1848 

1849 # The artifacts to register 

1850 artifacts = [] 

1851 

1852 # Refs that already exist 

1853 already_present = [] 

1854 

1855 # Now can transfer the artifacts 

1856 for source_ref, target_ref in zip(refs, local_refs): 

1857 if target_ref.id in target_records: 1857 ↛ 1859line 1857 didn't jump to line 1859, because the condition on line 1857 was never true

1858 # Already have an artifact for this. 

1859 already_present.append(target_ref) 

1860 continue 

1861 

1862 # mypy needs to know these are always resolved refs 

1863 for info in source_records[source_ref.getCheckedId()]: 

1864 source_location = info.file_location(source_datastore.locationFactory) 

1865 target_location = info.file_location(self.locationFactory) 

1866 if source_location == target_location: 1866 ↛ 1870line 1866 didn't jump to line 1870, because the condition on line 1866 was never true

1867 # Either the dataset is already in the target datastore 

1868 # (which is how execution butler currently runs) or 

1869 # it is an absolute URI. 

1870 if source_location.pathInStore.isabs(): 

1871 # Just because we can see the artifact when running 

1872 # the transfer doesn't mean it will be generally 

1873 # accessible to a user of this butler. For now warn 

1874 # but assume it will be accessible. 

1875 log.warning("Transfer request for an outside-datastore artifact has been found at %s", 

1876 source_location) 

1877 else: 

1878 # Need to transfer it to the new location. 

1879 # Assume we should always overwrite. If the artifact 

1880 # is there this might indicate that a previous transfer 

1881 # was interrupted but was not able to be rolled back 

1882 # completely (eg pre-emption) so follow Datastore default 

1883 # and overwrite. 

1884 target_location.uri.transfer_from(source_location.uri, transfer=transfer, 

1885 overwrite=True, transaction=self._transaction) 

1886 

1887 artifacts.append((target_ref, info)) 

1888 

1889 self._register_datasets(artifacts) 

1890 

1891 if already_present: 1891 ↛ 1892line 1891 didn't jump to line 1892, because the condition on line 1891 was never true

1892 n_skipped = len(already_present) 

1893 log.info("Skipped transfer of %d dataset%s already present in datastore", n_skipped, 

1894 "" if n_skipped == 1 else "s") 

1895 

1896 @transactional 

1897 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1898 # Docstring inherited. 

1899 refs = list(refs) 

1900 self.bridge.forget(refs) 

1901 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1902 

1903 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1904 logFailures: bool = False) -> None: 

1905 """Validate some of the configuration for this datastore. 

1906 

1907 Parameters 

1908 ---------- 

1909 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1910 Entities to test against this configuration. Can be differing 

1911 types. 

1912 logFailures : `bool`, optional 

1913 If `True`, output a log message for every validation error 

1914 detected. 

1915 

1916 Raises 

1917 ------ 

1918 DatastoreValidationError 

1919 Raised if there is a validation problem with a configuration. 

1920 All the problems are reported in a single exception. 

1921 

1922 Notes 

1923 ----- 

1924 This method checks that all the supplied entities have valid file 

1925 templates and also have formatters defined. 

1926 """ 

1927 

1928 templateFailed = None 

1929 try: 

1930 self.templates.validateTemplates(entities, logFailures=logFailures) 

1931 except FileTemplateValidationError as e: 

1932 templateFailed = str(e) 

1933 

1934 formatterFailed = [] 

1935 for entity in entities: 

1936 try: 

1937 self.formatterFactory.getFormatterClass(entity) 

1938 except KeyError as e: 

1939 formatterFailed.append(str(e)) 

1940 if logFailures: 1940 ↛ 1935line 1940 didn't jump to line 1935, because the condition on line 1940 was never false

1941 log.critical("Formatter failure: %s", e) 

1942 

1943 if templateFailed or formatterFailed: 

1944 messages = [] 

1945 if templateFailed: 1945 ↛ 1946line 1945 didn't jump to line 1946, because the condition on line 1945 was never true

1946 messages.append(templateFailed) 

1947 if formatterFailed: 1947 ↛ 1949line 1947 didn't jump to line 1949, because the condition on line 1947 was never false

1948 messages.append(",".join(formatterFailed)) 

1949 msg = ";\n".join(messages) 

1950 raise DatastoreValidationError(msg) 

1951 

1952 def getLookupKeys(self) -> Set[LookupKey]: 

1953 # Docstring is inherited from base class 

1954 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1955 self.constraints.getLookupKeys() 

1956 

1957 def validateKey(self, lookupKey: LookupKey, 

1958 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1959 # Docstring is inherited from base class 

1960 # The key can be valid in either formatters or templates so we can 

1961 # only check the template if it exists 

1962 if lookupKey in self.templates: 

1963 try: 

1964 self.templates[lookupKey].validateTemplate(entity) 

1965 except FileTemplateValidationError as e: 

1966 raise DatastoreValidationError(e) from e 

1967 

1968 def export(self, refs: Iterable[DatasetRef], *, 

1969 directory: Optional[Union[ButlerURI, str]] = None, 

1970 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1971 # Docstring inherited from Datastore.export. 

1972 if transfer is not None and directory is None: 1972 ↛ 1973line 1972 didn't jump to line 1973, because the condition on line 1972 was never true

1973 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1974 "export directory given") 

1975 

1976 # Force the directory to be a URI object 

1977 directoryUri: Optional[ButlerURI] = None 

1978 if directory is not None: 1978 ↛ 1981line 1978 didn't jump to line 1981, because the condition on line 1978 was never false

1979 directoryUri = ButlerURI(directory, forceDirectory=True) 

1980 

1981 if transfer is not None and directoryUri is not None: 1981 ↛ 1986line 1981 didn't jump to line 1986, because the condition on line 1981 was never false

1982 # mypy needs the second test 

1983 if not directoryUri.exists(): 1983 ↛ 1984line 1983 didn't jump to line 1984, because the condition on line 1983 was never true

1984 raise FileNotFoundError(f"Export location {directory} does not exist") 

1985 

1986 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

1987 for ref in progress.wrap(refs, "Exporting dataset files"): 

1988 fileLocations = self._get_dataset_locations_info(ref) 

1989 if not fileLocations: 1989 ↛ 1990line 1989 didn't jump to line 1990, because the condition on line 1989 was never true

1990 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1991 # For now we can not export disassembled datasets 

1992 if len(fileLocations) > 1: 1992 ↛ 1993line 1992 didn't jump to line 1993, because the condition on line 1992 was never true

1993 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1994 location, storedFileInfo = fileLocations[0] 

1995 

1996 pathInStore = location.pathInStore.path 

1997 if transfer is None: 1997 ↛ 2000line 1997 didn't jump to line 2000, because the condition on line 1997 was never true

1998 # TODO: do we also need to return the readStorageClass somehow? 

1999 # We will use the path in store directly 

2000 pass 

2001 elif transfer == "direct": 2001 ↛ 2003line 2001 didn't jump to line 2003, because the condition on line 2001 was never true

2002 # Use full URIs to the remote store in the export 

2003 pathInStore = str(location.uri) 

2004 else: 

2005 # mypy needs help 

2006 assert directoryUri is not None, "directoryUri must be defined to get here" 

2007 storeUri = ButlerURI(location.uri) 

2008 

2009 # if the datastore has an absolute URI to a resource, we 

2010 # have two options: 

2011 # 1. Keep the absolute URI in the exported YAML 

2012 # 2. Allocate a new name in the local datastore and transfer 

2013 # it. 

2014 # For now go with option 2 

2015 if location.pathInStore.isabs(): 2015 ↛ 2016line 2015 didn't jump to line 2016, because the condition on line 2015 was never true

2016 template = self.templates.getTemplate(ref) 

2017 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

2018 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2019 

2020 exportUri = directoryUri.join(pathInStore) 

2021 exportUri.transfer_from(storeUri, transfer=transfer) 

2022 

2023 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2024 

2025 @staticmethod 

2026 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

2027 """Compute the checksum of the supplied file. 

2028 

2029 Parameters 

2030 ---------- 

2031 uri : `ButlerURI` 

2032 Name of resource to calculate checksum from. 

2033 algorithm : `str`, optional 

2034 Name of algorithm to use. Must be one of the algorithms supported 

2035 by :py:class`hashlib`. 

2036 block_size : `int` 

2037 Number of bytes to read from file at one time. 

2038 

2039 Returns 

2040 ------- 

2041 hexdigest : `str` 

2042 Hex digest of the file. 

2043 

2044 Notes 

2045 ----- 

2046 Currently returns None if the URI is for a remote resource. 

2047 """ 

2048 if algorithm not in hashlib.algorithms_guaranteed: 2048 ↛ 2049line 2048 didn't jump to line 2049, because the condition on line 2048 was never true

2049 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2050 

2051 if not uri.isLocal: 2051 ↛ 2052line 2051 didn't jump to line 2052, because the condition on line 2051 was never true

2052 return None 

2053 

2054 hasher = hashlib.new(algorithm) 

2055 

2056 with uri.as_local() as local_uri: 

2057 with open(local_uri.ospath, "rb") as f: 

2058 for chunk in iter(lambda: f.read(block_size), b""): 

2059 hasher.update(chunk) 

2060 

2061 return hasher.hexdigest() 

2062 

2063 def needs_expanded_data_ids( 

2064 self, 

2065 transfer: Optional[str], 

2066 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2067 ) -> bool: 

2068 # Docstring inherited. 

2069 # This _could_ also use entity to inspect whether the filename template 

2070 # involves placeholders other than the required dimensions for its 

2071 # dataset type, but that's not necessary for correctness; it just 

2072 # enables more optimizations (perhaps only in theory). 

2073 return transfer not in ("direct", None)