Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%


883 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 CompositesMap, 

48 Config, 

49 DatasetId, 

50 DatasetRef, 

51 DatasetType, 

52 DatasetTypeNotSupportedError, 

53 Datastore, 

54 DatastoreCacheManager, 

55 DatastoreConfig, 

56 DatastoreDisabledCacheManager, 

57 DatastoreRecordData, 

58 DatastoreValidationError, 

59 FileDataset, 

60 FileDescriptor, 

61 FileTemplates, 

62 FileTemplateValidationError, 

63 Formatter, 

64 FormatterFactory, 

65 Location, 

66 LocationFactory, 

67 Progress, 

68 StorageClass, 

69 StoredDatastoreItemInfo, 

70 StoredFileInfo, 

71 ddl, 

72) 

73from lsst.daf.butler.core.repoRelocation import replaceRoot 

74from lsst.daf.butler.core.utils import transactional 

75from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

76from lsst.resources import ResourcePath, ResourcePathExpression 

77from lsst.utils.introspection import get_class_of, get_instance_of 

78from lsst.utils.iteration import chunk_iterable 

79 

80# For VERBOSE logging usage. 

81from lsst.utils.logging import VERBOSE, getLogger 

82from lsst.utils.timer import time_this 

83from sqlalchemy import BigInteger, String 

84 

85from .genericDatastore import GenericBaseDatastore 

86 

87 if TYPE_CHECKING:

88 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

89 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

90 

91log = getLogger(__name__) 

92 

93 

94class _IngestPrepData(Datastore.IngestPrepData): 

95 """Helper class for FileDatastore ingest implementation. 

96 

97 Parameters 

98 ---------- 

99 datasets : `list` of `FileDataset` 

100 Files to be ingested by this datastore. 

101 """ 

102 

103 def __init__(self, datasets: List[FileDataset]): 

104 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

105 self.datasets = datasets 

106 

107 

108@dataclass(frozen=True) 

109class DatastoreFileGetInformation: 

110 """Collection of useful parameters needed to retrieve a file from 

111 a Datastore. 

112 """ 

113 

114 location: Location 

115 """The location from which to read the dataset.""" 

116 

117 formatter: Formatter 

118 """The `Formatter` to use to deserialize the dataset.""" 

119 

120 info: StoredFileInfo 

121 """Stored information about this file and its formatter.""" 

122 

123 assemblerParams: Mapping[str, Any] 

124 """Parameters to use for post-processing the retrieved dataset.""" 

125 

126 formatterParams: Mapping[str, Any] 

127 """Parameters that were understood by the associated formatter.""" 

128 

129 component: Optional[str] 

130 """The component to be retrieved (can be `None`).""" 

131 

132 readStorageClass: StorageClass 

133 """The `StorageClass` of the dataset being read.""" 

134 

135 

136class FileDatastore(GenericBaseDatastore): 

137 """Generic Datastore for file-based implementations. 

138 

139 Should always be sub-classed since key abstract methods are missing. 

140 

141 Parameters 

142 ---------- 

143 config : `DatastoreConfig` or `str` 

144 Configuration as either a `Config` object or URI to file. 

145 bridgeManager : `DatastoreRegistryBridgeManager` 

146 Object that manages the interface between `Registry` and datastores. 

147 butlerRoot : `str`, optional 

148 New datastore root to use to override the configuration value. 

149 

150 Raises 

151 ------ 

152 ValueError 

153 If root location does not exist and ``create`` is `False` in the 

154 configuration. 

155 """ 

156 

157 defaultConfigFile: ClassVar[Optional[str]] = None 

158 """Path to configuration defaults. Accessed within the ``config`` resource 

159 or relative to a search path. Can be None if no defaults specified. 

160 """ 

161 

162 root: ResourcePath 

163 """Root directory URI of this `Datastore`.""" 

164 

165 locationFactory: LocationFactory 

166 """Factory for creating locations relative to the datastore root.""" 

167 

168 formatterFactory: FormatterFactory 

169 """Factory for creating instances of formatters.""" 

170 

171 templates: FileTemplates 

172 """File templates that can be used by this `Datastore`.""" 

173 

174 composites: CompositesMap 

175 """Determines whether a dataset should be disassembled on put.""" 

176 

177 defaultConfigFile = "datastores/fileDatastore.yaml" 

178 """Path to configuration defaults. Accessed within the ``config`` resource 

179 or relative to a search path. Can be None if no defaults specified. 

180 """ 

181 

182 @classmethod 

183 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

184 """Set any filesystem-dependent config options for this Datastore to 

185 be appropriate for a new empty repository with the given root. 

186 

187 Parameters 

188 ---------- 

189 root : `str` 

190 URI to the root of the data repository. 

191 config : `Config` 

192 A `Config` to update. Only the subset understood by 

193 this component will be updated. Will not expand 

194 defaults. 

195 full : `Config` 

196 A complete config with all defaults expanded that can be 

197 converted to a `DatastoreConfig`. Read-only and will not be 

198 modified by this method. 

199 Repository-specific options that should not be obtained 

200 from defaults when Butler instances are constructed 

201 should be copied from ``full`` to ``config``. 

202 overwrite : `bool`, optional 

203 If `False`, do not modify a value in ``config`` if the value 

204 already exists. Default is always to overwrite with the provided 

205 ``root``. 

206 

207 Notes 

208 ----- 

209 If a keyword is explicitly defined in the supplied ``config`` it 

210 will not be overridden by this method if ``overwrite`` is `False`. 

211 This allows explicit values set in external configs to be retained. 

212 """ 

213 Config.updateParameters( 

214 DatastoreConfig, 

215 config, 

216 full, 

217 toUpdate={"root": root}, 

218 toCopy=("cls", ("records", "table")), 

219 overwrite=overwrite, 

220 ) 

221 
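# For illustration only (the config objects and the root URI here are
# assumptions for the example): given a sparse per-repository ``config`` and
# a fully expanded ``full`` config, a call such as
#
#     FileDatastore.setConfigRoot("file:///data/repo", config, full,
#                                 overwrite=True)
#
# leaves ``config["root"]`` set to "file:///data/repo" while copying the
# "cls" and ("records", "table") entries over from ``full``.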

222 @classmethod 

223 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

224 return ddl.TableSpec( 

225 fields=[ 

226 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

227 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

228 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

229 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

230 # Use empty string to indicate no component 

231 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

232 # TODO: should checksum be Base64Bytes instead? 

233 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

234 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

235 ], 

236 unique=frozenset(), 

237 indexes=[tuple(["path"])], 

238 ) 

239 
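# For reference, a record in this opaque table (as produced later by
# ``StoredFileInfo.to_record``) has roughly the shape sketched below; the
# values are purely illustrative.
#
#     {
#         "dataset_id": ...,                    # registry-defined ID type
#         "path": "run/datasetType/file.fits",  # usually datastore-relative
#         "formatter": "some.module.SomeFormatter",
#         "storage_class": "StructuredDataDict",
#         "component": "",                      # empty string = no component
#         "checksum": None,                     # only set if checksums enabled
#         "file_size": 12345,
#     }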

240 def __init__( 

241 self, 

242 config: Union[DatastoreConfig, str], 

243 bridgeManager: DatastoreRegistryBridgeManager, 

244 butlerRoot: Optional[str] = None,

245 ): 

246 super().__init__(config, bridgeManager) 

247 if "root" not in self.config: 247 ↛ 248line 247 didn't jump to line 248, because the condition on line 247 was never true

248 raise ValueError("No root directory specified in configuration") 

249 

250 self._bridgeManager = bridgeManager 

251 

252 # Name ourselves either using an explicit name or a name 

253 # derived from the (unexpanded) root 

254 if "name" in self.config: 

255 self.name = self.config["name"] 

256 else: 

257 # We use the unexpanded root in the name to indicate that this 

258 # datastore can be moved without having to update registry. 

259 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

260 

261 # Support repository relocation in config 

262 # Existence of self.root is checked in subclass 

263 self.root = ResourcePath( 

264 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

265 ) 

266 

267 self.locationFactory = LocationFactory(self.root) 

268 self.formatterFactory = FormatterFactory() 

269 

270 # Now associate formatters with storage classes 

271 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

272 

273 # Read the file naming templates 

274 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

275 

276 # See if composites should be disassembled 

277 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

278 

279 tableName = self.config["records", "table"] 

280 try: 

281 # Storage of paths and formatters, keyed by dataset_id 

282 self._table = bridgeManager.opaque.register( 

283 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

284 ) 

285 # Interface to Registry. 

286 self._bridge = bridgeManager.register(self.name) 

287 except ReadOnlyDatabaseError: 

288 # If the database is read only and we just tried and failed to 

289 # create a table, it means someone is trying to create a read-only 

290 # butler client for an empty repo. That should be okay, as long 

291 # as they don't then try to get any datasets before some other client

292 # creates the table. Chances are they're just validating

293 # configuration. 

294 pass 

295 

296 # Determine whether checksums should be used - default to False 

297 self.useChecksum = self.config.get("checksum", False) 

298 

299 # Determine whether we can fall back to configuration if a 

300 # requested dataset is not known to registry 

301 self.trustGetRequest = self.config.get("trust_get_request", False) 

302 

303 # Create a cache manager 

304 self.cacheManager: AbstractDatastoreCacheManager 

305 if "cached" in self.config: 305 ↛ 308line 305 didn't jump to line 308, because the condition on line 305 was never false

306 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

307 else: 

308 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

309 

310 # Check existence and create directory structure if necessary 

311 if not self.root.exists(): 

312 if "create" not in self.config or not self.config["create"]: 312 ↛ 313line 312 didn't jump to line 313, because the condition on line 312 was never true

313 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

314 try: 

315 self.root.mkdir() 

316 except Exception as e: 

317 raise ValueError( 

318 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

319 ) from e 

320 

321 def __str__(self) -> str: 

322 return str(self.root) 

323 

324 @property 

325 def bridge(self) -> DatastoreRegistryBridge: 

326 return self._bridge 

327 

328 def _artifact_exists(self, location: Location) -> bool: 

329 """Check that an artifact exists in this datastore at the specified 

330 location. 

331 

332 Parameters 

333 ---------- 

334 location : `Location` 

335 Expected location of the artifact associated with this datastore. 

336 

337 Returns 

338 ------- 

339 exists : `bool` 

340 `True` if the location can be found, `False` otherwise.

341 """ 

342 log.debug("Checking if resource exists: %s", location.uri) 

343 return location.uri.exists() 

344 

345 def _delete_artifact(self, location: Location) -> None: 

346 """Delete the artifact from the datastore. 

347 

348 Parameters 

349 ---------- 

350 location : `Location` 

351 Location of the artifact associated with this datastore. 

352 """ 

353 if location.pathInStore.isabs():

354 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

355 

356 try: 

357 location.uri.remove() 

358 except FileNotFoundError: 

359 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

360 raise 

361 except Exception as e: 

362 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

363 raise 

364 log.debug("Successfully deleted file: %s", location.uri) 

365 

366 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

367 # Docstring inherited from GenericBaseDatastore 

368 records = [info.rebase(ref).to_record() for ref, info in zip(refs, infos)] 

369 self._table.insert(*records) 

370 

371 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

372 # Docstring inherited from GenericBaseDatastore 

373 

374 # Look for the dataset_id -- there might be multiple matches 

375 # if we have disassembled the dataset. 

376 records = self._table.fetch(dataset_id=ref.id) 

377 return [StoredFileInfo.from_record(record) for record in records] 

378 

379 def _get_stored_records_associated_with_refs( 

380 self, refs: Iterable[DatasetIdRef] 

381 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

382 """Retrieve all records associated with the provided refs. 

383 

384 Parameters 

385 ---------- 

386 refs : iterable of `DatasetIdRef` 

387 The refs for which records are to be retrieved. 

388 

389 Returns 

390 ------- 

391 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

392 The matching records indexed by the ref ID. The number of entries 

393 in the dict can be smaller than the number of requested refs. 

394 """ 

395 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

396 

397 # Uniqueness is dataset_id + component so can have multiple records 

398 # per ref. 

399 records_by_ref = defaultdict(list) 

400 for record in records: 

401 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

402 return records_by_ref 

403 
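# Shape of the returned mapping, illustratively: an ordinary dataset
# contributes a single StoredFileInfo, a disassembled composite one per
# component (IDs and component names below are placeholders).
#
#     {
#         <dataset_id A>: [StoredFileInfo(component=None, ...)],
#         <dataset_id B>: [StoredFileInfo(component="image", ...),
#                          StoredFileInfo(component="mask", ...)],
#     }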

404 def _refs_associated_with_artifacts( 

405 self, paths: List[Union[str, ResourcePath]] 

406 ) -> Dict[str, Set[DatasetId]]: 

407 """Return paths and associated dataset refs. 

408 

409 Parameters 

410 ---------- 

411 paths : `list` of `str` or `lsst.resources.ResourcePath` 

412 All the paths to include in search. 

413 

414 Returns 

415 ------- 

416 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

417 Mapping of each path to a set of associated database IDs. 

418 """ 

419 records = self._table.fetch(path=[str(path) for path in paths]) 

420 result = defaultdict(set) 

421 for row in records: 

422 result[row["path"]].add(row["dataset_id"]) 

423 return result 

424 

425 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

426 """Return all dataset refs associated with the supplied path. 

427 

428 Parameters 

429 ---------- 

430 pathInStore : `lsst.resources.ResourcePath` 

431 Path of interest in the data store. 

432 

433 Returns 

434 ------- 

435 ids : `set` of `DatasetId`

436 All `DatasetRef` IDs associated with this path. 

437 """ 

438 records = list(self._table.fetch(path=str(pathInStore))) 

439 ids = {r["dataset_id"] for r in records} 

440 return ids 

441 

442 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

443 # Docstring inherited from GenericBaseDatastore 

444 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

445 

446 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

447 r"""Find all the `Location`\ s of the requested dataset in the 

448 `Datastore` and the associated stored file information. 

449 

450 Parameters 

451 ---------- 

452 ref : `DatasetRef` 

453 Reference to the required `Dataset`. 

454 

455 Returns 

456 ------- 

457 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

458 Location of the dataset within the datastore and 

459 stored information about each file and its formatter. 

460 """ 

461 # Get the file information (this will fail if no file) 

462 records = self.getStoredItemsInfo(ref) 

463 

464 # Use the path to determine the location -- we need to take 

465 # into account absolute URIs in the datastore record 

466 return [(r.file_location(self.locationFactory), r) for r in records] 

467 

468 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

469 """Check that there is only one dataset associated with the 

470 specified artifact. 

471 

472 Parameters 

473 ---------- 

474 ref : `DatasetRef` or `FakeDatasetRef` 

475 Dataset to be removed. 

476 location : `Location` 

477 The location of the artifact to be removed. 

478 

479 Returns 

480 ------- 

481 can_remove : `bool`

482 `True` if the artifact can be safely removed.

483 """ 

484 # Can't ever delete absolute URIs. 

485 if location.pathInStore.isabs(): 

486 return False 

487 

488 # Get all entries associated with this path 

489 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

490 if not allRefs: 

491 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

492 

493 # Remove these refs from all the refs and if there is nothing left 

494 # then we can delete 

495 remainingRefs = allRefs - {ref.id} 

496 

497 if remainingRefs: 

498 return False 

499 return True 

500 

501 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

502 """Predict the location and related file information of the requested 

503 dataset in this datastore. 

504 

505 Parameters 

506 ---------- 

507 ref : `DatasetRef` 

508 Reference to the required `Dataset`. 

509 

510 Returns 

511 ------- 

512 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

513 Expected Location of the dataset within the datastore and 

514 placeholder information about each file and its formatter. 

515 

516 Notes 

517 ----- 

518 Uses the current configuration to determine how we would expect the 

519 datastore files to have been written if we couldn't ask registry. 

520 This is safe so long as there has been no change to datastore 

521 configuration between writing the dataset and wanting to read it. 

522 Will not work for files that have been ingested without using the 

523 standard file template or default formatter. 

524 """ 

525 

526 # If we have a component ref we always need to ask the questions 

527 # of the composite. If the composite is disassembled this routine 

528 # should return all components. If the composite was not 

529 # disassembled the composite is what is stored regardless of 

530 # component request. Note that if the caller has disassembled 

531 # a composite there is no way for this guess to know that 

532 # without trying both the composite and component ref and seeing 

533 # if there is something at the component Location even without 

534 # disassembly being enabled. 

535 if ref.datasetType.isComponent(): 

536 ref = ref.makeCompositeRef() 

537 

538 # See if the ref is a composite that should be disassembled 

539 doDisassembly = self.composites.shouldBeDisassembled(ref) 

540 

541 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

542 

543 if doDisassembly: 

544 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

545 compRef = ref.makeComponentRef(component) 

546 location, formatter = self._determine_put_formatter_location(compRef) 

547 all_info.append((location, formatter, componentStorage, component)) 

548 

549 else: 

550 # Always use the composite ref if no disassembly 

551 location, formatter = self._determine_put_formatter_location(ref) 

552 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

553 

554 # Convert the list of tuples to have StoredFileInfo as second element 

555 return [ 

556 ( 

557 location, 

558 StoredFileInfo( 

559 formatter=formatter, 

560 path=location.pathInStore.path, 

561 storageClass=storageClass, 

562 component=component, 

563 checksum=None, 

564 file_size=-1, 

565 dataset_id=ref.getCheckedId(), 

566 ), 

567 ) 

568 for location, formatter, storageClass, component in all_info 

569 ] 

570 

571 def _prepare_for_get( 

572 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

573 ) -> List[DatastoreFileGetInformation]: 

574 """Check parameters for ``get`` and obtain formatter and 

575 location. 

576 

577 Parameters 

578 ---------- 

579 ref : `DatasetRef` 

580 Reference to the required Dataset. 

581 parameters : `dict` 

582 `StorageClass`-specific parameters that specify, for example, 

583 a slice of the dataset to be loaded. 

584 

585 Returns 

586 ------- 

587 getInfo : `list` [`DatastoreFileGetInformation`] 

588 Parameters needed to retrieve each file. 

589 """ 

590 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

591 

592 # Get file metadata and internal metadata 

593 fileLocations = self._get_dataset_locations_info(ref) 

594 if not fileLocations: 

595 if not self.trustGetRequest: 

596 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

597 # Assume the dataset is where we think it should be 

598 fileLocations = self._get_expected_dataset_locations_info(ref) 

599 

600 # The storage class we want to use eventually 

601 refStorageClass = ref.datasetType.storageClass 

602 

603 if len(fileLocations) > 1: 

604 disassembled = True 

605 

606 # If trust is involved it is possible that there will be 

607 # components listed here that do not exist in the datastore. 

608 # Explicitly check for file artifact existence and filter out any 

609 # that are missing. 

610 if self.trustGetRequest: 

611 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

612 

613 # For now complain only if we have no components at all. One 

614 # component is probably a problem but we can punt that to the 

615 # assembler. 

616 if not fileLocations:

617 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

618 

619 else: 

620 disassembled = False 

621 

622 # Is this a component request? 

623 refComponent = ref.datasetType.component() 

624 

625 fileGetInfo = [] 

626 for location, storedFileInfo in fileLocations: 

627 

628 # The storage class used to write the file 

629 writeStorageClass = storedFileInfo.storageClass 

630 

631 # If this has been disassembled we need read to match the write 

632 if disassembled: 

633 readStorageClass = writeStorageClass 

634 else: 

635 readStorageClass = refStorageClass 

636 

637 formatter = get_instance_of( 

638 storedFileInfo.formatter, 

639 FileDescriptor( 

640 location, 

641 readStorageClass=readStorageClass, 

642 storageClass=writeStorageClass, 

643 parameters=parameters, 

644 ), 

645 ref.dataId, 

646 ) 

647 

648 formatterParams, notFormatterParams = formatter.segregateParameters() 

649 

650 # Of the remaining parameters, extract the ones supported by 

651 # this StorageClass (for components not all will be handled) 

652 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

653 

654 # The ref itself could be a component if the dataset was 

655 # disassembled by butler, or we disassembled in datastore and 

656 # components came from the datastore records 

657 component = storedFileInfo.component if storedFileInfo.component else refComponent 

658 

659 fileGetInfo.append( 

660 DatastoreFileGetInformation( 

661 location, 

662 formatter, 

663 storedFileInfo, 

664 assemblerParams, 

665 formatterParams, 

666 component, 

667 readStorageClass, 

668 ) 

669 ) 

670 

671 return fileGetInfo 

672 

673 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

674 """Check the arguments for ``put`` and obtain formatter and 

675 location. 

676 

677 Parameters 

678 ---------- 

679 inMemoryDataset : `object` 

680 The dataset to store. 

681 ref : `DatasetRef` 

682 Reference to the associated Dataset. 

683 

684 Returns 

685 ------- 

686 location : `Location` 

687 The location to write the dataset. 

688 formatter : `Formatter` 

689 The `Formatter` to use to write the dataset. 

690 

691 Raises 

692 ------ 

693 TypeError 

694 Supplied object and storage class are inconsistent. 

695 DatasetTypeNotSupportedError 

696 The associated `DatasetType` is not handled by this datastore. 

697 """ 

698 self._validate_put_parameters(inMemoryDataset, ref) 

699 return self._determine_put_formatter_location(ref) 

700 

701 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

702 """Calculate the formatter and output location to use for put. 

703 

704 Parameters 

705 ---------- 

706 ref : `DatasetRef` 

707 Reference to the associated Dataset. 

708 

709 Returns 

710 ------- 

711 location : `Location` 

712 The location to write the dataset. 

713 formatter : `Formatter` 

714 The `Formatter` to use to write the dataset. 

715 """ 

716 # Work out output file name 

717 try: 

718 template = self.templates.getTemplate(ref) 

719 except KeyError as e: 

720 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

721 

722 # Validate the template to protect against filenames from different 

723 # dataIds returning the same and causing overwrite confusion. 

724 template.validateTemplate(ref) 

725 

726 location = self.locationFactory.fromPath(template.format(ref)) 

727 

728 # Get the formatter based on the storage class 

729 storageClass = ref.datasetType.storageClass 

730 try: 

731 formatter = self.formatterFactory.getFormatter( 

732 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

733 ) 

734 except KeyError as e: 

735 raise DatasetTypeNotSupportedError( 

736 f"Unable to find formatter for {ref} in datastore {self.name}" 

737 ) from e 

738 

739 # Now that we know the formatter, update the location 

740 location = formatter.makeUpdatedLocation(location) 

741 

742 return location, formatter 

743 
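# Worked example of the flow above (template, data ID values and extension
# are assumptions for illustration): a template such as
# "{run}/{datasetType}/{datasetType}_{visit}" might format to
# "myrun/calexp/calexp_903334"; the formatter selected for the storage class
# then appends its preferred extension via makeUpdatedLocation, giving a
# final datastore path of "myrun/calexp/calexp_903334.fits".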

744 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

745 # Docstring inherited from base class 

746 if transfer != "auto": 

747 return transfer 

748 

749 # See if the paths are within the datastore or not 

750 inside = [self._pathInStore(d.path) is not None for d in datasets] 

751 

752 if all(inside): 

753 transfer = None 

754 elif not any(inside):

755 # Allow ResourcePath to use its own knowledge 

756 transfer = "auto" 

757 else: 

758 # This can happen when importing from a datastore that has had

759 # some datasets ingested using "direct" mode. Allow ResourcePath

760 # to sort it out, but warn about it.

763 log.warning( 

764 "Some datasets are inside the datastore and some are outside. Using 'split' " 

765 "transfer mode. This assumes that the files outside the datastore are " 

766 "still accessible to the new butler since they will not be copied into " 

767 "the target datastore." 

768 ) 

769 transfer = "split" 

770 

771 return transfer 

772 
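# Illustrative resolution of transfer="auto" (paths and the datastore root
# of "file:///repo/" are assumptions for the example):
#
#     datasets = (FileDataset(path="file:///repo/run/a.fits", refs=[ref1]),
#                 FileDataset(path="file:///elsewhere/b.fits", refs=[ref2]))
#     datastore._overrideTransferMode(*datasets, transfer="auto")
#     # -> "split": one file is already inside the datastore, one is not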

773 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

774 """Return path relative to datastore root 

775 

776 Parameters 

777 ---------- 

778 path : `lsst.resources.ResourcePathExpression` 

779 Path to dataset. Can be an absolute URI. If relative, it is

780 assumed to be relative to the datastore root.

782 

783 Returns 

784 ------- 

785 inStore : `str` 

786 Path relative to datastore root. Returns `None` if the file is 

787 outside the root. 

788 """ 

789 # Relative path will always be relative to datastore 

790 pathUri = ResourcePath(path, forceAbsolute=False) 

791 return pathUri.relative_to(self.root) 

792 
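# Illustrative behaviour, assuming a datastore root of "file:///repo/":
#
#     datastore._pathInStore("file:///repo/run/a.fits")   # -> "run/a.fits"
#     datastore._pathInStore("file:///elsewhere/a.fits")  # -> None
#     datastore._pathInStore("run/a.fits")                # relative paths are
#                                                         # treated as in-store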

793 def _standardizeIngestPath( 

794 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

795 ) -> Union[str, ResourcePath]: 

796 """Standardize the path of a to-be-ingested file. 

797 

798 Parameters 

799 ---------- 

800 path : `str` or `lsst.resources.ResourcePath` 

801 Path of a file to be ingested. This parameter is not expected 

802 to be all the types that can be used to construct a 

803 `~lsst.resources.ResourcePath`. 

804 transfer : `str`, optional 

805 How (and whether) the dataset should be added to the datastore. 

806 See `ingest` for details of transfer modes. 

807 This implementation is provided only so 

808 `NotImplementedError` can be raised if the mode is not supported; 

809 actual transfers are deferred to `_extractIngestInfo`. 

810 

811 Returns 

812 ------- 

813 path : `str` or `lsst.resources.ResourcePath` 

814 New path in what the datastore considers standard form. If an 

815 absolute URI was given that will be returned unchanged. 

816 

817 Notes 

818 ----- 

819 Subclasses of `FileDatastore` can implement this method instead 

820 of `_prepIngest`. It should not modify the data repository or given 

821 file in any way. 

822 

823 Raises 

824 ------ 

825 NotImplementedError 

826 Raised if the datastore does not support the given transfer mode 

827 (including the case where ingest is not supported at all). 

828 FileNotFoundError 

829 Raised if one of the given files does not exist. 

830 """ 

831 if transfer not in (None, "direct", "split") + self.root.transferModes:

832 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

833 

834 # A relative URI indicates relative to datastore root 

835 srcUri = ResourcePath(path, forceAbsolute=False) 

836 if not srcUri.isabs(): 

837 srcUri = self.root.join(path) 

838 

839 if not srcUri.exists(): 

840 raise FileNotFoundError( 

841 f"Resource at {srcUri} does not exist; note that paths to ingest " 

842 f"are assumed to be relative to {self.root} unless they are absolute." 

843 ) 

844 

845 if transfer is None: 

846 relpath = srcUri.relative_to(self.root) 

847 if not relpath: 

848 raise RuntimeError( 

849 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

850 ) 

851 

852 # Return the relative path within the datastore for internal 

853 # transfer 

854 path = relpath 

855 

856 return path 

857 

858 def _extractIngestInfo( 

859 self, 

860 path: ResourcePathExpression, 

861 ref: DatasetRef, 

862 *, 

863 formatter: Union[Formatter, Type[Formatter]], 

864 transfer: Optional[str] = None, 

865 record_validation_info: bool = True, 

866 ) -> StoredFileInfo: 

867 """Relocate (if necessary) and extract `StoredFileInfo` from a 

868 to-be-ingested file. 

869 

870 Parameters 

871 ---------- 

872 path : `lsst.resources.ResourcePathExpression` 

873 URI or path of a file to be ingested. 

874 ref : `DatasetRef` 

875 Reference for the dataset being ingested. Guaranteed to have 

876 ``dataset_id not None``.

877 formatter : `type` or `Formatter` 

878 `Formatter` subclass to use for this dataset or an instance. 

879 transfer : `str`, optional 

880 How (and whether) the dataset should be added to the datastore. 

881 See `ingest` for details of transfer modes. 

882 record_validation_info : `bool`, optional 

883 If `True`, the default, the datastore can record validation 

884 information associated with the file. If `False` the datastore 

885 will not attempt to track any information such as checksums 

886 or file sizes. This can be useful if such information is tracked 

887 in an external system or if the file is to be compressed in place. 

888 It is up to the datastore whether this parameter is relevant. 

889 

890 Returns 

891 ------- 

892 info : `StoredFileInfo` 

893 Internal datastore record for this file. This will be inserted by 

894 the caller; the `_extractIngestInfo` is only responsible for 

895 creating and populating the struct. 

896 

897 Raises 

898 ------ 

899 FileNotFoundError 

900 Raised if one of the given files does not exist. 

901 FileExistsError 

902 Raised if transfer is not `None` but the (internal) location the 

903 file would be moved to is already occupied. 

904 """ 

905 if self._transaction is None:

906 raise RuntimeError("Ingest called without transaction enabled") 

907 

908 # Create URI of the source path, do not need to force a relative 

909 # path to absolute. 

910 srcUri = ResourcePath(path, forceAbsolute=False) 

911 

912 # Track whether we have read the size of the source yet 

913 have_sized = False 

914 

915 tgtLocation: Optional[Location] 

916 if transfer is None or transfer == "split": 

917 # A relative path is assumed to be relative to the datastore 

918 # in this context 

919 if not srcUri.isabs(): 

920 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

921 else: 

922 # Work out the path in the datastore from an absolute URI 

923 # This is required to be within the datastore. 

924 pathInStore = srcUri.relative_to(self.root) 

925 if pathInStore is None and transfer is None:

926 raise RuntimeError( 

927 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

928 ) 

929 if pathInStore:

930 tgtLocation = self.locationFactory.fromPath(pathInStore) 

931 elif transfer == "split": 

932 # Outside the datastore but treat that as a direct ingest 

933 # instead. 

934 tgtLocation = None 

935 else: 

936 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

937 elif transfer == "direct":

938 # Want to store the full URI to the resource directly in 

939 # datastore. This is useful for referring to permanent archive 

940 # storage for raw data. 

941 # Trust that people know what they are doing. 

942 tgtLocation = None 

943 else: 

944 # Work out the name we want this ingested file to have 

945 # inside the datastore 

946 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

947 if not tgtLocation.uri.dirname().exists(): 

948 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

949 tgtLocation.uri.dirname().mkdir() 

950 

951 # if we are transferring from a local file to a remote location 

952 # it may be more efficient to get the size and checksum of the 

953 # local file rather than the transferred one 

954 if record_validation_info and srcUri.isLocal: 

955 size = srcUri.size() 

956 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

957 have_sized = True 

958 

959 # Transfer the resource to the destination. 

960 # Allow overwrite of an existing file. This matches the behavior 

961 # of datastore.put() in that it trusts that registry would not 

962 # be asking to overwrite unless registry thought that the 

963 # overwrite was allowed. 

964 tgtLocation.uri.transfer_from( 

965 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

966 ) 

967 

968 if tgtLocation is None:

969 # This means we are using direct mode 

970 targetUri = srcUri 

971 targetPath = str(srcUri) 

972 else: 

973 targetUri = tgtLocation.uri 

974 targetPath = tgtLocation.pathInStore.path 

975 

976 # the file should exist in the datastore now 

977 if record_validation_info: 

978 if not have_sized: 

979 size = targetUri.size() 

980 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

981 else: 

982 # Not recording any file information. 

983 size = -1 

984 checksum = None 

985 

986 return StoredFileInfo( 

987 formatter=formatter, 

988 path=targetPath, 

989 storageClass=ref.datasetType.storageClass, 

990 component=ref.datasetType.component(), 

991 file_size=size, 

992 checksum=checksum, 

993 dataset_id=ref.getCheckedId(), 

994 ) 

995 

996 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

997 # Docstring inherited from Datastore._prepIngest. 

998 filtered = [] 

999 for dataset in datasets: 

1000 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

1001 if not acceptable: 

1002 continue 

1003 else: 

1004 dataset.refs = acceptable 

1005 if dataset.formatter is None: 

1006 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1007 else: 

1008 assert isinstance(dataset.formatter, (type, str)) 

1009 formatter_class = get_class_of(dataset.formatter) 

1010 if not issubclass(formatter_class, Formatter):

1011 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1012 dataset.formatter = formatter_class 

1013 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1014 filtered.append(dataset) 

1015 return _IngestPrepData(filtered) 

1016 
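# Illustrative input (path, ref and formatter are assumptions for the
# example): each FileDataset may carry an explicit formatter, otherwise one
# is looked up from its first ref.
#
#     FileDataset(path="config/data.json", refs=[ref],
#                 formatter="lsst.daf.butler.formatters.json.JsonFormatter")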

1017 @transactional 

1018 def _finishIngest( 

1019 self, 

1020 prepData: Datastore.IngestPrepData, 

1021 *, 

1022 transfer: Optional[str] = None, 

1023 record_validation_info: bool = True, 

1024 ) -> None: 

1025 # Docstring inherited from Datastore._finishIngest. 

1026 refsAndInfos = [] 

1027 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1028 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1029 # Do ingest as if the first dataset ref is associated with the file 

1030 info = self._extractIngestInfo( 

1031 dataset.path, 

1032 dataset.refs[0], 

1033 formatter=dataset.formatter, 

1034 transfer=transfer, 

1035 record_validation_info=record_validation_info, 

1036 ) 

1037 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1038 self._register_datasets(refsAndInfos) 

1039 

1040 def _calculate_ingested_datastore_name( 

1041 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1042 ) -> Location: 

1043 """Given a source URI and a DatasetRef, determine the name the 

1044 dataset will have inside datastore. 

1045 

1046 Parameters 

1047 ---------- 

1048 srcUri : `lsst.resources.ResourcePath` 

1049 URI to the source dataset file. 

1050 ref : `DatasetRef` 

1051 Ref associated with the newly-ingested dataset artifact. This 

1052 is used to determine the name within the datastore. 

1053 formatter : `Formatter` or Formatter class. 

1054 Formatter to use for validation. Can be a class or an instance. 

1055 

1056 Returns 

1057 ------- 

1058 location : `Location` 

1059 Target location for the newly-ingested dataset. 

1060 """ 

1061 # Ingesting a file from outside the datastore. 

1062 # This involves a new name. 

1063 template = self.templates.getTemplate(ref) 

1064 location = self.locationFactory.fromPath(template.format(ref)) 

1065 

1066 # Get the extension 

1067 ext = srcUri.getExtension() 

1068 

1069 # Update the destination to include that extension 

1070 location.updateExtension(ext) 

1071 

1072 # Ask the formatter to validate this extension 

1073 formatter.validateExtension(location) 

1074 

1075 return location 

1076 
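# Worked example (source file name and template expansion are assumptions
# for illustration): ingesting "file:///staging/raw_0001.fits" for a ref
# whose template formats to "myrun/raw/raw_0001" yields a target location of
# "myrun/raw/raw_0001.fits"; validateExtension raises if the chosen
# formatter cannot handle ".fits".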

1077 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1078 """Write out in memory dataset to datastore. 

1079 

1080 Parameters 

1081 ---------- 

1082 inMemoryDataset : `object` 

1083 Dataset to write to datastore. 

1084 ref : `DatasetRef` 

1085 Registry information associated with this dataset. 

1086 

1087 Returns 

1088 ------- 

1089 info : `StoredFileInfo` 

1090 Information describing the artifact written to the datastore. 

1091 """ 

1092 # May need to coerce the in memory dataset to the correct 

1093 # python type. 

1094 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1095 

1096 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1097 uri = location.uri 

1098 

1099 if not uri.dirname().exists(): 

1100 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1101 uri.dirname().mkdir() 

1102 

1103 if self._transaction is None:

1104 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1105 

1106 def _removeFileExists(uri: ResourcePath) -> None: 

1107 """Remove a file and do not complain if it is not there. 

1108 

1109 This is important since a formatter might fail before the file 

1110 is written and we should not confuse people by writing spurious 

1111 error messages to the log. 

1112 """ 

1113 try: 

1114 uri.remove() 

1115 except FileNotFoundError: 

1116 pass 

1117 

1118 # Register a callback to try to delete the uploaded data if 

1119 # something fails below 

1120 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1121 

1122 # For a local file, simply use the formatter directly 

1123 if uri.isLocal: 

1124 try: 

1125 formatter.write(inMemoryDataset) 

1126 except Exception as e: 

1127 raise RuntimeError( 

1128 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1129 ) from e 

1130 log.debug("Successfully wrote python object to local file at %s", uri) 

1131 else: 

1132 # This is a remote URI. Some datasets can be serialized directly 

1133 # to bytes and sent to the remote datastore without writing a 

1134 # file. If the dataset is intended to be saved to the cache 

1135 # a file is always written and direct write to the remote 

1136 # datastore is bypassed. 

1137 data_written = False 

1138 if not self.cacheManager.should_be_cached(ref): 

1139 try: 

1140 serializedDataset = formatter.toBytes(inMemoryDataset) 

1141 except NotImplementedError: 

1142 # Fallback to the file writing option. 

1143 pass 

1144 except Exception as e: 

1145 raise RuntimeError( 

1146 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1147 ) from e 

1148 else: 

1149 log.debug("Writing bytes directly to %s", uri) 

1150 uri.write(serializedDataset, overwrite=True) 

1151 log.debug("Successfully wrote bytes directly to %s", uri) 

1152 data_written = True 

1153 

1154 if not data_written: 

1155 # Did not write the bytes directly to object store so instead 

1156 # write to temporary file. 

1157 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1158 # Need to configure the formatter to write to a different 

1159 # location and that needs us to overwrite internals 

1160 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1161 with formatter._updateLocation(Location(None, temporary_uri)): 

1162 try: 

1163 formatter.write(inMemoryDataset) 

1164 except Exception as e: 

1165 raise RuntimeError( 

1166 f"Failed to serialize dataset {ref} of type" 

1167 f" {type(inMemoryDataset)} to " 

1168 f"temporary location {temporary_uri}" 

1169 ) from e 

1170 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1171 

1172 # Cache if required 

1173 self.cacheManager.move_to_cache(temporary_uri, ref) 

1174 

1175 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1176 

1177 # URI is needed to resolve which ingest case we are dealing with

1178 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1179 
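# Rough decision tree for the write paths above (a sketch, not code from
# this module):
#
#     if uri.isLocal:                     formatter.write() straight to disk
#     elif not to-be-cached and the
#          formatter supports toBytes():  uri.write(serialized bytes)
#     else:                               write to a temporary file, then
#                                         uri.transfer_from() it and offer
#                                         the temporary file to the cache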

1180 def _read_artifact_into_memory( 

1181 self, 

1182 getInfo: DatastoreFileGetInformation, 

1183 ref: DatasetRef, 

1184 isComponent: bool = False, 

1185 cache_ref: Optional[DatasetRef] = None, 

1186 ) -> Any: 

1187 """Read the artifact from datastore into in memory object. 

1188 

1189 Parameters 

1190 ---------- 

1191 getInfo : `DatastoreFileGetInformation` 

1192 Information about the artifact within the datastore. 

1193 ref : `DatasetRef` 

1194 The registry information associated with this artifact. 

1195 isComponent : `bool` 

1196 Flag to indicate if a component is being read from this artifact. 

1197 cache_ref : `DatasetRef`, optional 

1198 The DatasetRef to use when looking up the file in the cache. 

1199 This ref must have the same ID as the supplied ref but can 

1200 be a parent ref or component ref to indicate to the cache whether 

1201 a composite file is being requested from the cache or a component 

1202 file. Without this the cache will default to the supplied ref but 

1203 it can get confused with read-only derived components for 

1204 disassembled composites. 

1205 

1206 Returns 

1207 ------- 

1208 inMemoryDataset : `object` 

1209 The artifact as a python object. 

1210 """ 

1211 location = getInfo.location 

1212 uri = location.uri 

1213 log.debug("Accessing data from %s", uri) 

1214 

1215 if cache_ref is None: 

1216 cache_ref = ref 

1217 if cache_ref.id != ref.id:

1218 raise ValueError( 

1219 "The supplied cache dataset ref refers to a different dataset than expected:" 

1220 f" {ref.id} != {cache_ref.id}" 

1221 ) 

1222 

1223 # Cannot recalculate checksum but can compare size as a quick check 

1224 # Do not do this if the size is negative since that indicates 

1225 # we do not know. 

1226 recorded_size = getInfo.info.file_size 

1227 resource_size = uri.size() 

1228 if recorded_size >= 0 and resource_size != recorded_size:

1229 raise RuntimeError( 

1230 "Integrity failure in Datastore. " 

1231 f"Size of file {uri} ({resource_size}) " 

1232 f"does not match size recorded in registry of {recorded_size}" 

1233 ) 

1234 

1235 # For the general case we have choices for how to proceed. 

1236 # 1. Always use a local file (downloading the remote resource to a 

1237 # temporary file if needed). 

1238 # 2. Use a threshold size and read into memory and use bytes. 

1239 # Use both for now with an arbitrary hand off size. 

1240 # This allows small datasets to be downloaded from remote object 

1241 # stores without requiring a temporary file. 

1242 

1243 formatter = getInfo.formatter 

1244 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1245 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1246 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1247 if cached_file is not None: 

1248 desired_uri = cached_file 

1249 msg = f" (cached version of {uri})" 

1250 else: 

1251 desired_uri = uri 

1252 msg = "" 

1253 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1254 serializedDataset = desired_uri.read() 

1255 log.debug( 

1256 "Deserializing %s from %d bytes from location %s with formatter %s", 

1257 f"component {getInfo.component}" if isComponent else "", 

1258 len(serializedDataset), 

1259 uri, 

1260 formatter.name(), 

1261 ) 

1262 try: 

1263 result = formatter.fromBytes( 

1264 serializedDataset, component=getInfo.component if isComponent else None 

1265 ) 

1266 except Exception as e: 

1267 raise ValueError( 

1268 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1269 f" ({ref.datasetType.name} from {uri}): {e}" 

1270 ) from e 

1271 else: 

1272 # Read from file. 

1273 

1274 # Have to update the Location associated with the formatter 

1275 # because formatter.read does not allow an override. 

1276 # This could be improved. 

1277 location_updated = False 

1278 msg = "" 

1279 

1280 # First check in cache for local version. 

1281 # The cache will only be relevant for remote resources but 

1282 # no harm in always asking. Context manager ensures that cache 

1283 # file is not deleted during cache expiration. 

1284 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1285 if cached_file is not None: 

1286 msg = f"(via cache read of remote file {uri})" 

1287 uri = cached_file 

1288 location_updated = True 

1289 

1290 with uri.as_local() as local_uri: 

1291 

1292 can_be_cached = False 

1293 if uri != local_uri:

1294 # URI was remote and file was downloaded 

1295 cache_msg = "" 

1296 location_updated = True 

1297 

1298 if self.cacheManager.should_be_cached(cache_ref): 

1299 # In this scenario we want to ask if the downloaded 

1300 # file should be cached but we should not cache 

1301 # it until after we've used it (to ensure it can't 

1302 # be expired whilst we are using it). 

1303 can_be_cached = True 

1304 

1305 # Say that it is "likely" to be cached because 

1306 # if the formatter read fails we will not be 

1307 # caching this file. 

1308 cache_msg = " and likely cached" 

1309 

1310 msg = f"(via download to local file{cache_msg})" 

1311 

1312 # Calculate the (possibly) new location for the formatter 

1313 # to use. 

1314 newLocation = Location(*local_uri.split()) if location_updated else None 

1315 

1316 log.debug( 

1317 "Reading%s from location %s %s with formatter %s", 

1318 f" component {getInfo.component}" if isComponent else "", 

1319 uri, 

1320 msg, 

1321 formatter.name(), 

1322 ) 

1323 try: 

1324 with formatter._updateLocation(newLocation): 

1325 with time_this( 

1326 log, 

1327 msg="Reading%s from location %s %s with formatter %s", 

1328 args=( 

1329 f" component {getInfo.component}" if isComponent else "", 

1330 uri, 

1331 msg, 

1332 formatter.name(), 

1333 ), 

1334 ): 

1335 result = formatter.read(component=getInfo.component if isComponent else None) 

1336 except Exception as e: 

1337 raise ValueError( 

1338 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1339 f" ({ref.datasetType.name} from {uri}): {e}" 

1340 ) from e 

1341 

1342 # File was read successfully so can move to cache 

1343 if can_be_cached:

1344 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1345 

1346 return self._post_process_get( 

1347 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1348 ) 

1349 

1350 def knows(self, ref: DatasetRef) -> bool: 

1351 """Check if the dataset is known to the datastore. 

1352 

1353 Does not check for existence of any artifact. 

1354 

1355 Parameters 

1356 ---------- 

1357 ref : `DatasetRef` 

1358 Reference to the required dataset. 

1359 

1360 Returns 

1361 ------- 

1362 exists : `bool` 

1363 `True` if the dataset is known to the datastore. 

1364 """ 

1365 fileLocations = self._get_dataset_locations_info(ref) 

1366 if fileLocations: 

1367 return True 

1368 return False 

1369 

1370 def _process_mexists_records( 

1371 self, 

1372 id_to_ref: Dict[DatasetId, DatasetRef], 

1373 records: Dict[DatasetId, List[StoredFileInfo]], 

1374 all_required: bool, 

1375 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1376 ) -> Dict[DatasetRef, bool]: 

1377 """Helper function for mexists that checks the given records. 

1378 

1379 Parameters 

1380 ---------- 

1381 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1382 Mapping of the dataset ID to the dataset ref itself. 

1383 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1384 Records as generally returned by 

1385 ``_get_stored_records_associated_with_refs``. 

1386 all_required : `bool` 

1387 If `True`, a dataset exists only if all the artifacts associated

1388 with its dataset ID exist; otherwise any one artifact suffices.

1389 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1390 Optional mapping of datastore artifact to existence. Updated by 

1391 this method with details of all artifacts tested. Can be `None` 

1392 if the caller is not interested. 

1393 

1394 Returns 

1395 ------- 

1396 existence : `dict` of [`DatasetRef`, `bool`] 

1397 Mapping from dataset to boolean indicating existence. 

1398 """ 

1399 # The URIs to be checked and a mapping of those URIs to 

1400 # the dataset ID. 

1401 uris_to_check: List[ResourcePath] = [] 

1402 location_map: Dict[ResourcePath, DatasetId] = {} 

1403 

1404 location_factory = self.locationFactory 

1405 

1406 uri_existence: Dict[ResourcePath, bool] = {} 

1407 for ref_id, infos in records.items(): 

1408 # Key is the dataset Id, value is list of StoredItemInfo 

1409 uris = [info.file_location(location_factory).uri for info in infos] 

1410 location_map.update({uri: ref_id for uri in uris}) 

1411 

1412 # Check the local cache directly for a dataset corresponding 

1413 # to the remote URI. 

1414 if self.cacheManager.file_count > 0: 

1415 ref = id_to_ref[ref_id] 

1416 for uri, storedFileInfo in zip(uris, infos): 

1417 check_ref = ref 

1418 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):

1419 check_ref = ref.makeComponentRef(component) 

1420 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1421 # Proxy for URI existence. 

1422 uri_existence[uri] = True 

1423 else: 

1424 uris_to_check.append(uri) 

1425 else: 

1426 # Check all of them. 

1427 uris_to_check.extend(uris) 

1428 

1429 if artifact_existence is not None: 

1430 # If a URI has already been checked remove it from the list 

1431 # and immediately add the status to the output dict. 

1432 filtered_uris_to_check = [] 

1433 for uri in uris_to_check: 

1434 if uri in artifact_existence: 

1435 uri_existence[uri] = artifact_existence[uri] 

1436 else: 

1437 filtered_uris_to_check.append(uri) 

1438 uris_to_check = filtered_uris_to_check 

1439 

1440 # Results. 

1441 dataset_existence: Dict[DatasetRef, bool] = {} 

1442 

1443 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1444 for uri, exists in uri_existence.items(): 

1445 dataset_id = location_map[uri] 

1446 ref = id_to_ref[dataset_id] 

1447 

1448 # Disassembled composite needs to check all locations. 

1449 # all_required indicates whether all need to exist or not. 

1450 if ref in dataset_existence: 

1451 if all_required: 

1452 exists = dataset_existence[ref] and exists 

1453 else: 

1454 exists = dataset_existence[ref] or exists 

1455 dataset_existence[ref] = exists 

1456 

1457 if artifact_existence is not None: 

1458 artifact_existence.update(uri_existence) 

1459 

1460 return dataset_existence 

1461 

1462 def mexists( 

1463 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1464 ) -> Dict[DatasetRef, bool]: 

1465 """Check the existence of multiple datasets at once. 

1466 

1467 Parameters 

1468 ---------- 

1469 refs : iterable of `DatasetRef` 

1470 The datasets to be checked. 

1471 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1472 Optional mapping of datastore artifact to existence. Updated by 

1473 this method with details of all artifacts tested. Can be `None` 

1474 if the caller is not interested. 

1475 

1476 Returns 

1477 ------- 

1478 existence : `dict` of [`DatasetRef`, `bool`] 

1479 Mapping from dataset to boolean indicating existence. 

1480 

1481 Notes 

1482 ----- 

1483 To minimize potentially costly remote existence checks, the local 

1484 cache is checked as a proxy for existence. If a file for this 

1485 `DatasetRef` does exist no check is done for the actual URI. This 

1486 could result in possibly unexpected behavior if the dataset itself 

1487 has been removed from the datastore by another process whilst it is 

1488 still in the cache. 

1489 """ 

1490 chunk_size = 10_000 

1491 dataset_existence: Dict[DatasetRef, bool] = {} 

1492 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1493 n_found_total = 0 

1494 n_checked = 0 

1495 n_chunks = 0 

1496 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1497 chunk_result = self._mexists(chunk, artifact_existence) 

1498 if log.isEnabledFor(VERBOSE): 

1499 n_results = len(chunk_result) 

1500 n_checked += n_results 

1501 # Can treat the booleans as 0, 1 integers and sum them. 

1502 n_found = sum(chunk_result.values()) 

1503 n_found_total += n_found 

1504 log.verbose( 

1505 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1506 n_chunks, 

1507 n_found, 

1508 n_results, 

1509 n_found_total, 

1510 n_checked, 

1511 ) 

1512 dataset_existence.update(chunk_result) 

1513 n_chunks += 1 

1514 

1515 return dataset_existence 

1516 
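# [Editor's usage sketch -- not part of the measured fileDatastore.py source]
# Bulk existence check with a shared artifact-existence cache. The repository
# path "repo", the collection name and the "calexp" dataset type are
# placeholder assumptions, and ``butler.datastore`` is assumed to be a
# FileDatastore configured for that repository.
from typing import Dict

from lsst.daf.butler import Butler
from lsst.resources import ResourcePath

butler = Butler("repo", collections=["placeholder/collection"])
refs = list(butler.registry.queryDatasets("calexp", collections=["placeholder/collection"]))
datastore = butler.datastore

# Reusable cache of per-URI existence results, filled in by mexists().
artifact_existence: Dict[ResourcePath, bool] = {}
existence = datastore.mexists(refs, artifact_existence=artifact_existence)
missing = [ref for ref, found in existence.items() if not found]
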

1517 def _mexists( 

1518 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1519 ) -> Dict[DatasetRef, bool]: 

1520 """Check the existence of multiple datasets at once. 

1521 

1522 Parameters 

1523 ---------- 

1524 refs : iterable of `DatasetRef` 

1525 The datasets to be checked. 

1526 

1527 Returns 

1528 ------- 

1529 existence : `dict` of [`DatasetRef`, `bool`] 

1530 Mapping from dataset to boolean indicating existence. 

1531 """ 

1532 # Need a mapping of dataset_id to dataset ref since the API 

1533 # works with dataset_id 

1534 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1535 

1536 # Set of all IDs we are checking for. 

1537 requested_ids = set(id_to_ref.keys()) 

1538 

1539 # The records themselves. Could be missing some entries. 

1540 records = self._get_stored_records_associated_with_refs(refs) 

1541 

1542 dataset_existence = self._process_mexists_records( 

1543 id_to_ref, records, True, artifact_existence=artifact_existence 

1544 ) 

1545 

1546 # Set of IDs that have been handled. 

1547 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1548 

1549 missing_ids = requested_ids - handled_ids 

1550 if missing_ids: 

1551 if not self.trustGetRequest: 

1552 # Must assume these do not exist 

1553 for missing in missing_ids: 

1554 dataset_existence[id_to_ref[missing]] = False 

1555 else: 

1556 log.debug( 

1557 "%d out of %d datasets were not known to datastore during initial existence check.", 

1558 len(missing_ids), 

1559 len(requested_ids), 

1560 ) 

1561 

1562 # Construct data structure identical to that returned 

1563 # by _get_stored_records_associated_with_refs() but using 

1564 # guessed names. 

1565 records = {} 

1566 for missing in missing_ids: 

1567 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1568 records[missing] = [info for _, info in expected] 

1569 

1570 dataset_existence.update( 

1571 self._process_mexists_records( 

1572 id_to_ref, records, False, artifact_existence=artifact_existence 

1573 ) 

1574 ) 

1575 

1576 return dataset_existence 

1577 

1578 def exists(self, ref: DatasetRef) -> bool: 

1579 """Check if the dataset exists in the datastore. 

1580 

1581 Parameters 

1582 ---------- 

1583 ref : `DatasetRef` 

1584 Reference to the required dataset. 

1585 

1586 Returns 

1587 ------- 

1588 exists : `bool` 

1589 `True` if the entity exists in the `Datastore`. 

1590 

1591 Notes 

1592 ----- 

1593 The local cache is checked as a proxy for existence in the remote 

1594 object store. It is possible that another process on a different 

1595 compute node could remove the file from the object store even 

1596 though it is present in the local cache. 

1597 """ 

1598 fileLocations = self._get_dataset_locations_info(ref) 

1599 

1600 # If we are being asked to trust that the registry might not be 

1601 # correct, we ask for the expected locations and check them explicitly. 

1602 if not fileLocations: 

1603 if not self.trustGetRequest: 

1604 return False 

1605 

1606 # First check the cache. If it is not found we must check 

1607 # the datastore itself. Assume that any component in the cache 

1608 # means that the dataset does exist somewhere. 

1609 if self.cacheManager.known_to_cache(ref): 1609 ↛ 1610line 1609 didn't jump to line 1610, because the condition on line 1609 was never true

1610 return True 

1611 

1612 # When we are guessing a dataset location we can not check 

1613 # for the existence of every component since we can not 

1614 # know if every component was written. Instead we check 

1615 # for the existence of any of the expected locations. 

1616 for location, _ in self._get_expected_dataset_locations_info(ref): 

1617 if self._artifact_exists(location): 

1618 return True 

1619 return False 

1620 

1621 # All listed artifacts must exist. 

1622 for location, storedFileInfo in fileLocations: 

1623 # Checking in cache needs the component ref. 

1624 check_ref = ref 

1625 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1626 check_ref = ref.makeComponentRef(component) 

1627 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1628 continue 

1629 

1630 if not self._artifact_exists(location): 

1631 return False 

1632 

1633 return True 

1634 
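# [Editor's usage sketch -- not part of the measured source] Single-dataset
# check; a hit in the local cache counts as existence, as the Notes above
# describe. ``datastore`` and ``ref`` are placeholders as in the earlier
# mexists sketch.
if not datastore.exists(ref):
    raise FileNotFoundError(f"No artifacts for {ref} in datastore {datastore.name}")
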

1635 def getURIs( 

1636 self, ref: DatasetRef, predict: bool = False 

1637 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1638 """Return URIs associated with dataset. 

1639 

1640 Parameters 

1641 ---------- 

1642 ref : `DatasetRef` 

1643 Reference to the required dataset. 

1644 predict : `bool`, optional 

1645 If the datastore does not know about the dataset, should it 

1646 return a predicted URI or not? 

1647 

1648 Returns 

1649 ------- 

1650 primary : `lsst.resources.ResourcePath` 

1651 The URI to the primary artifact associated with this dataset. 

1652 If the dataset was disassembled within the datastore this 

1653 may be `None`. 

1654 components : `dict` 

1655 URIs to any components associated with the dataset artifact. 

1656 Can be empty if there are no components. 

1657 """ 

1658 

1659 primary: Optional[ResourcePath] = None 

1660 components: Dict[str, ResourcePath] = {} 

1661 

1662 # if this has never been written then we have to guess 

1663 if not self.exists(ref): 

1664 if not predict: 

1665 raise FileNotFoundError(f"Dataset {ref} not in this datastore") 

1666 

1667 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1668 

1669 if doDisassembly: 

1670 

1671 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1672 compRef = ref.makeComponentRef(component) 

1673 compLocation, _ = self._determine_put_formatter_location(compRef) 

1674 

1675 # Add a URI fragment to indicate this is a guess 

1676 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted") 

1677 

1678 else: 

1679 

1680 location, _ = self._determine_put_formatter_location(ref) 

1681 

1682 # Add a URI fragment to indicate this is a guess 

1683 primary = ResourcePath(location.uri.geturl() + "#predicted") 

1684 

1685 return primary, components 

1686 

1687 # If this is a ref that we have written we can get the path. 

1688 # Get file metadata and internal metadata 

1689 fileLocations = self._get_dataset_locations_info(ref) 

1690 

1691 guessing = False 

1692 if not fileLocations: 

1693 if not self.trustGetRequest: 1693 ↛ 1694line 1693 didn't jump to line 1694, because the condition on line 1693 was never true

1694 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1695 fileLocations = self._get_expected_dataset_locations_info(ref) 

1696 guessing = True 

1697 

1698 if len(fileLocations) == 1: 

1699 # No disassembly so this is the primary URI 

1700 uri = fileLocations[0][0].uri 

1701 if guessing and not uri.exists(): 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true

1702 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1703 primary = uri 

1704 

1705 else: 

1706 for location, storedFileInfo in fileLocations: 

1707 if storedFileInfo.component is None: 1707 ↛ 1708line 1707 didn't jump to line 1708, because the condition on line 1707 was never true

1708 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1709 uri = location.uri 

1710 if guessing and not uri.exists(): 1710 ↛ 1714line 1710 didn't jump to line 1714, because the condition on line 1710 was never true

1711 # If we are trusting then it is entirely possible for 

1712 # some components to be missing. In that case we skip 

1713 # to the next component. 

1714 if self.trustGetRequest: 

1715 continue 

1716 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1717 components[storedFileInfo.component] = uri 

1718 

1719 return primary, components 

1720 
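# [Editor's usage sketch -- not part of the measured source] With
# predict=True an unwritten dataset yields URIs carrying a "#predicted"
# fragment instead of raising FileNotFoundError. ``datastore`` and ``ref``
# are placeholders as above.
primary, components = datastore.getURIs(ref, predict=True)
if primary is not None:
    print("primary artifact:", primary)
for comp_name, comp_uri in components.items():
    print("component", comp_name, "->", comp_uri)
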

1721 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1722 """URI to the Dataset. 

1723 

1724 Parameters 

1725 ---------- 

1726 ref : `DatasetRef` 

1727 Reference to the required Dataset. 

1728 predict : `bool` 

1729 If `True`, allow URIs to be returned of datasets that have not 

1730 been written. 

1731 

1732 Returns 

1733 ------- 

1734 uri : `lsst.resources.ResourcePath` 

1735 URI pointing to the dataset within the datastore. If the 

1736 dataset does not exist in the datastore, and if ``predict`` is 

1737 `True`, the URI will be a prediction and will include a URI 

1738 fragment "#predicted". 

1739 If the datastore does not have entities that relate well 

1740 to the concept of a URI the returned URI will be 

1741 descriptive. The returned URI is not guaranteed to be obtainable. 

1742 

1743 Raises 

1744 ------ 

1745 FileNotFoundError 

1746 Raised if a URI has been requested for a dataset that does not 

1747 exist and guessing is not allowed. 

1748 RuntimeError 

1749 Raised if a request is made for a single URI but multiple URIs 

1750 are associated with this dataset. 

1751 

1752 Notes 

1753 ----- 

1754 When a predicted URI is requested an attempt will be made to form 

1755 a reasonable URI based on file templates and the expected formatter. 

1756 """ 

1757 primary, components = self.getURIs(ref, predict) 

1758 if primary is None or components: 1758 ↛ 1759line 1758 didn't jump to line 1759, because the condition on line 1758 was never true

1759 raise RuntimeError( 

1760 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1761 ) 

1762 return primary 

1763 
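# [Editor's usage sketch -- not part of the measured source] getURI() only
# succeeds when the dataset maps to a single artifact; for a disassembled
# composite it raises RuntimeError and the per-component URIs must be
# requested instead. Placeholders as above.
try:
    print(datastore.getURI(ref))
except RuntimeError:
    _, components = datastore.getURIs(ref)
    for comp_name, comp_uri in components.items():
        print(comp_name, comp_uri)
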

1764 def retrieveArtifacts( 

1765 self, 

1766 refs: Iterable[DatasetRef], 

1767 destination: ResourcePath, 

1768 transfer: str = "auto", 

1769 preserve_path: bool = True, 

1770 overwrite: bool = False, 

1771 ) -> List[ResourcePath]: 

1772 """Retrieve the file artifacts associated with the supplied refs. 

1773 

1774 Parameters 

1775 ---------- 

1776 refs : iterable of `DatasetRef` 

1777 The datasets for which file artifacts are to be retrieved. 

1778 A single ref can result in multiple files. The refs must 

1779 be resolved. 

1780 destination : `lsst.resources.ResourcePath` 

1781 Location to write the file artifacts. 

1782 transfer : `str`, optional 

1783 Method to use to transfer the artifacts. Must be one of the options 

1784 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1785 "move" is not allowed. 

1786 preserve_path : `bool`, optional 

1787 If `True` the full path of the file artifact within the datastore 

1788 is preserved. If `False` the final file component of the path 

1789 is used. 

1790 overwrite : `bool`, optional 

1791 If `True` allow transfers to overwrite existing files at the 

1792 destination. 

1793 

1794 Returns 

1795 ------- 

1796 targets : `list` of `lsst.resources.ResourcePath` 

1797 URIs of file artifacts in destination location. Order is not 

1798 preserved. 

1799 """ 

1800 if not destination.isdir(): 1800 ↛ 1801line 1800 didn't jump to line 1801, because the condition on line 1800 was never true

1801 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1802 

1803 if transfer == "move": 

1804 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1805 

1806 # Source -> Destination 

1807 # This also helps filter out duplicate DatasetRef in the request 

1808 # that will map to the same underlying file transfer. 

1809 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1810 

1811 for ref in refs: 

1812 locations = self._get_dataset_locations_info(ref) 

1813 for location, _ in locations: 

1814 source_uri = location.uri 

1815 target_path: ResourcePathExpression 

1816 if preserve_path: 

1817 target_path = location.pathInStore 

1818 if target_path.isabs(): 1818 ↛ 1821line 1818 didn't jump to line 1821, because the condition on line 1818 was never true

1819 # This is an absolute path to an external file. 

1820 # Use the full path. 

1821 target_path = target_path.relativeToPathRoot 

1822 else: 

1823 target_path = source_uri.basename() 

1824 target_uri = destination.join(target_path) 

1825 to_transfer[source_uri] = target_uri 

1826 

1827 # In theory can now parallelize the transfer 

1828 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1829 for source_uri, target_uri in to_transfer.items(): 

1830 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1831 

1832 return list(to_transfer.values()) 

1833 
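# [Editor's usage sketch -- not part of the measured source] Copy the file
# artifacts behind some refs into a local directory, keeping the in-store
# paths. The destination directory "extracted/" is a placeholder and is
# assumed to exist already.
from lsst.resources import ResourcePath

destination = ResourcePath("extracted/", forceDirectory=True)
targets = datastore.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)
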

1834 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1835 """Load an InMemoryDataset from the store. 

1836 

1837 Parameters 

1838 ---------- 

1839 ref : `DatasetRef` 

1840 Reference to the required Dataset. 

1841 parameters : `dict` 

1842 `StorageClass`-specific parameters that specify, for example, 

1843 a slice of the dataset to be loaded. 

1844 

1845 Returns 

1846 ------- 

1847 inMemoryDataset : `object` 

1848 Requested dataset or slice thereof as an InMemoryDataset. 

1849 

1850 Raises 

1851 ------ 

1852 FileNotFoundError 

1853 Requested dataset can not be retrieved. 

1854 TypeError 

1855 Return value from formatter has unexpected type. 

1856 ValueError 

1857 Formatter failed to process the dataset. 

1858 """ 

1859 allGetInfo = self._prepare_for_get(ref, parameters) 

1860 refComponent = ref.datasetType.component() 

1861 

1862 # Supplied storage class for the component being read 

1863 refStorageClass = ref.datasetType.storageClass 

1864 

1865 # Create mapping from component name to related info 

1866 allComponents = {i.component: i for i in allGetInfo} 

1867 

1868 # By definition the dataset is disassembled if we have more 

1869 # than one record for it. 

1870 isDisassembled = len(allGetInfo) > 1 

1871 

1872 # Look for the special case where we are disassembled but the 

1873 # component is a derived component that was not written during 

1874 # disassembly. For this scenario we need to check that the 

1875 # component requested is listed as a derived component for the 

1876 # composite storage class 

1877 isDisassembledReadOnlyComponent = False 

1878 if isDisassembled and refComponent: 

1879 # The composite storage class should be accessible through 

1880 # the component dataset type 

1881 compositeStorageClass = ref.datasetType.parentStorageClass 

1882 

1883 # In the unlikely scenario where the composite storage 

1884 # class is not known, we can only assume that this is a 

1885 # normal component. If that assumption is wrong then the 

1886 # branch below that reads a persisted component will fail 

1887 # so there is no need to complain here. 

1888 if compositeStorageClass is not None: 1888 ↛ 1891line 1888 didn't jump to line 1891, because the condition on line 1888 was never false

1889 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1890 

1891 if isDisassembled and not refComponent: 

1892 # This was a disassembled dataset spread over multiple files 

1893 # and we need to put them all back together again. 

1894 # Read into memory and then assemble 

1895 

1896 # Check that the supplied parameters are suitable for the type read 

1897 refStorageClass.validateParameters(parameters) 

1898 

1899 # We want to keep track of all the parameters that were not used 

1900 # by formatters. We assume that if any of the component formatters 

1901 # use a parameter that we do not need to apply it again in the 

1902 # assembler. 

1903 usedParams = set() 

1904 

1905 components: Dict[str, Any] = {} 

1906 for getInfo in allGetInfo: 

1907 # assemblerParams are parameters not understood by the 

1908 # associated formatter. 

1909 usedParams.update(set(getInfo.formatterParams)) 

1910 

1911 component = getInfo.component 

1912 

1913 if component is None: 1913 ↛ 1914line 1913 didn't jump to line 1914, because the condition on line 1913 was never true

1914 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1915 

1916 # We do not want the formatter to think it's reading 

1917 # a component though because it is really reading a 

1918 # standalone dataset -- always tell reader it is not a 

1919 # component. 

1920 components[component] = self._read_artifact_into_memory( 

1921 getInfo, ref.makeComponentRef(component), isComponent=False 

1922 ) 

1923 

1924 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1925 

1926 # Any unused parameters will have to be passed to the assembler 

1927 if parameters: 

1928 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1929 else: 

1930 unusedParams = {} 

1931 

1932 # Process parameters 

1933 return ref.datasetType.storageClass.delegate().handleParameters( 

1934 inMemoryDataset, parameters=unusedParams 

1935 ) 

1936 

1937 elif isDisassembledReadOnlyComponent: 

1938 

1939 compositeStorageClass = ref.datasetType.parentStorageClass 

1940 if compositeStorageClass is None: 1940 ↛ 1941line 1940 didn't jump to line 1941, because the condition on line 1940 was never true

1941 raise RuntimeError( 

1942 f"Unable to retrieve derived component '{refComponent}' since" 

1943 "no composite storage class is available." 

1944 ) 

1945 

1946 if refComponent is None: 1946 ↛ 1948line 1946 didn't jump to line 1948, because the condition on line 1946 was never true

1947 # Mainly for mypy 

1948 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1949 

1950 # Assume that every derived component can be calculated by 

1951 # forwarding the request to a single read/write component. 

1952 # Rather than guessing which rw component is the right one by 

1953 # scanning each for a derived component of the same name, 

1954 # we ask the storage class delegate directly which one is best to 

1955 # use. 

1956 compositeDelegate = compositeStorageClass.delegate() 

1957 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1958 refComponent, set(allComponents) 

1959 ) 

1960 

1961 # Select the relevant component 

1962 rwInfo = allComponents[forwardedComponent] 

1963 

1964 # For now assume that read parameters are validated against 

1965 # the real component and not the requested component 

1966 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1967 forwardedStorageClass.validateParameters(parameters) 

1968 

1969 # The reference to use for the caching must refer to the forwarded 

1970 # component and not the derived component. 

1971 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1972 

1973 # Unfortunately the FileDescriptor inside the formatter will have 

1974 # the wrong write storage class so we need to create a new one 

1975 # given the immutability constraint. 

1976 writeStorageClass = rwInfo.info.storageClass 

1977 

1978 # We may need to put some thought into parameters for read 

1979 # components but for now forward them on as is 

1980 readFormatter = type(rwInfo.formatter)( 

1981 FileDescriptor( 

1982 rwInfo.location, 

1983 readStorageClass=refStorageClass, 

1984 storageClass=writeStorageClass, 

1985 parameters=parameters, 

1986 ), 

1987 ref.dataId, 

1988 ) 

1989 

1990 # The assembler can not receive any parameter requests for a 

1991 # derived component at this time since the assembler will 

1992 # see the storage class of the derived component and those 

1993 # parameters will have to be handled by the formatter on the 

1994 # forwarded storage class. 

1995 assemblerParams: Dict[str, Any] = {} 

1996 

1997 # Need to create a new info that specifies the derived 

1998 # component and associated storage class 

1999 readInfo = DatastoreFileGetInformation( 

2000 rwInfo.location, 

2001 readFormatter, 

2002 rwInfo.info, 

2003 assemblerParams, 

2004 {}, 

2005 refComponent, 

2006 refStorageClass, 

2007 ) 

2008 

2009 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2010 

2011 else: 

2012 # Single file request or component from that composite file 

2013 for lookup in (refComponent, None): 2013 ↛ 2018line 2013 didn't jump to line 2018, because the loop on line 2013 didn't complete

2014 if lookup in allComponents: 2014 ↛ 2013line 2014 didn't jump to line 2013, because the condition on line 2014 was never false

2015 getInfo = allComponents[lookup] 

2016 break 

2017 else: 

2018 raise FileNotFoundError( 

2019 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2020 ) 

2021 

2022 # Do not need the component itself if already disassembled 

2023 if isDisassembled: 

2024 isComponent = False 

2025 else: 

2026 isComponent = getInfo.component is not None 

2027 

2028 # For a component read of a composite we want the cache to 

2029 # be looking at the composite ref itself. 

2030 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2031 

2032 # For a disassembled component we can validate parameters against 

2033 # the component storage class directly 

2034 if isDisassembled: 

2035 refStorageClass.validateParameters(parameters) 

2036 else: 

2037 # For an assembled composite this could be a derived 

2038 # component derived from a real component. The validity 

2039 # of the parameters is not clear. For now validate against 

2040 # the composite storage class 

2041 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2042 

2043 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2044 
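# [Editor's usage sketch -- not part of the measured source] Read a full
# dataset and, separately, a single component of a composite via a component
# ref. The "wcs" component name is purely illustrative and depends on the
# storage class of the placeholder ``ref``.
exposure = datastore.get(ref)
wcs = datastore.get(ref.makeComponentRef("wcs"))
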

2045 @transactional 

2046 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2047 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2048 

2049 Parameters 

2050 ---------- 

2051 inMemoryDataset : `object` 

2052 The dataset to store. 

2053 ref : `DatasetRef` 

2054 Reference to the associated Dataset. 

2055 

2056 Raises 

2057 ------ 

2058 TypeError 

2059 Supplied object and storage class are inconsistent. 

2060 DatasetTypeNotSupportedError 

2061 The associated `DatasetType` is not handled by this datastore. 

2062 

2063 Notes 

2064 ----- 

2065 If the datastore is configured to reject certain dataset types it 

2066 is possible that the put will fail and raise a 

2067 `DatasetTypeNotSupportedError`. The main use case for this is to 

2068 allow `ChainedDatastore` to put to multiple datastores without 

2069 requiring that every datastore accepts the dataset. 

2070 """ 

2071 

2072 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2073 # doDisassembly = True 

2074 

2075 artifacts = [] 

2076 if doDisassembly: 

2077 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2078 if components is None: 2078 ↛ 2079line 2078 didn't jump to line 2079, because the condition on line 2078 was never true

2079 raise RuntimeError( 

2080 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2081 f"with storage class {ref.datasetType.storageClass.name} " 

2082 "is configured to be disassembled, but cannot be." 

2083 ) 

2084 for component, componentInfo in components.items(): 

2085 # Don't recurse because we want to take advantage of 

2086 # bulk insert -- need a new DatasetRef that refers to the 

2087 # same dataset_id but has the component DatasetType 

2088 # DatasetType does not refer to the types of components 

2089 # So we construct one ourselves. 

2090 compRef = ref.makeComponentRef(component) 

2091 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2092 artifacts.append((compRef, storedInfo)) 

2093 else: 

2094 # Write the entire thing out 

2095 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2096 artifacts.append((ref, storedInfo)) 

2097 

2098 self._register_datasets(artifacts) 

2099 
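# [Editor's usage sketch -- not part of the measured source] Write an
# in-memory object; whether it is disassembled into per-component files is
# decided by the composites configuration, not by the caller.
# ``in_memory_dataset`` is a placeholder object matching the ref's storage
# class.
from lsst.daf.butler import DatasetTypeNotSupportedError

try:
    datastore.put(in_memory_dataset, ref)
except DatasetTypeNotSupportedError:
    log.warning("Datastore %s is configured to reject %s", datastore.name, ref.datasetType.name)
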

2100 @transactional 

2101 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2102 # At this point can safely remove these datasets from the cache 

2103 # to avoid confusion later on. If they are not trashed later 

2104 # the cache will simply be refilled. 

2105 self.cacheManager.remove_from_cache(ref) 

2106 

2107 # If we are in trust mode there will be nothing to move to 

2108 # the trash table and we will have to try to delete the file 

2109 # immediately. 

2110 if self.trustGetRequest: 

2111 # Try to keep the logic below for a single file trash. 

2112 if isinstance(ref, DatasetRef): 

2113 refs = {ref} 

2114 else: 

2115 # Will recreate ref at the end of this branch. 

2116 refs = set(ref) 

2117 

2118 # Determine which datasets are known to datastore directly. 

2119 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2120 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2121 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2122 

2123 missing = refs - existing_refs 

2124 if missing: 

2125 # Do an explicit existence check on these refs. 

2126 # We only care about the artifacts at this point and not 

2127 # the dataset existence. 

2128 artifact_existence: Dict[ResourcePath, bool] = {} 

2129 _ = self.mexists(missing, artifact_existence) 

2130 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2131 

2132 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2133 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2134 for uri in uris: 

2135 try: 

2136 uri.remove() 

2137 except Exception as e: 

2138 if ignore_errors: 

2139 log.debug("Artifact %s could not be removed: %s", uri, e) 

2140 continue 

2141 raise 

2142 

2143 # There is no point asking the code below to remove refs we 

2144 # know are missing so update it with the list of existing 

2145 # records. Try to retain one vs many logic. 

2146 if not existing_refs: 

2147 # Nothing more to do since none of the datasets were 

2148 # known to the datastore record table. 

2149 return 

2150 ref = list(existing_refs) 

2151 if len(ref) == 1: 

2152 ref = ref[0] 

2153 

2154 # Get file metadata and internal metadata 

2155 if not isinstance(ref, DatasetRef): 

2156 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2157 # Assumed to be an iterable of refs so bulk mode enabled. 

2158 try: 

2159 self.bridge.moveToTrash(ref) 

2160 except Exception as e: 

2161 if ignore_errors: 

2162 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2163 else: 

2164 raise 

2165 return 

2166 

2167 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2168 

2169 fileLocations = self._get_dataset_locations_info(ref) 

2170 

2171 if not fileLocations: 

2172 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2173 if ignore_errors: 

2174 log.warning(err_msg) 

2175 return 

2176 else: 

2177 raise FileNotFoundError(err_msg) 

2178 

2179 for location, storedFileInfo in fileLocations: 

2180 if not self._artifact_exists(location): 2180 ↛ 2181line 2180 didn't jump to line 2181

2181 err_msg = ( 

2182 f"Dataset is known to datastore {self.name} but " 

2183 f"associated artifact ({location.uri}) is missing" 

2184 ) 

2185 if ignore_errors: 

2186 log.warning(err_msg) 

2187 return 

2188 else: 

2189 raise FileNotFoundError(err_msg) 

2190 

2191 # Mark dataset as trashed 

2192 try: 

2193 self.bridge.moveToTrash([ref]) 

2194 except Exception as e: 

2195 if ignore_errors: 

2196 log.warning( 

2197 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2198 "but encountered an error: %s", 

2199 ref, 

2200 self.name, 

2201 e, 

2202 ) 

2203 pass 

2204 else: 

2205 raise 

2206 

2207 @transactional 

2208 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2209 """Remove all datasets from the trash. 

2210 

2211 Parameters 

2212 ---------- 

2213 ignore_errors : `bool` 

2214 If `True` return without error even if something went wrong. 

2215 Problems could occur if another process is simultaneously trying 

2216 to delete. 

2217 """ 

2218 log.debug("Emptying trash in datastore %s", self.name) 

2219 

2220 # Context manager will empty trash iff we finish it without raising. 

2221 # It will also automatically delete the relevant rows from the 

2222 # trash table and the records table. 

2223 with self.bridge.emptyTrash( 

2224 self._table, record_class=StoredFileInfo, record_column="path" 

2225 ) as trash_data: 

2226 # Removing the artifacts themselves requires that the files are 

2227 # not also associated with refs that are not to be trashed. 

2228 # Therefore need to do a query with the file paths themselves 

2229 # and return all the refs associated with them. Can only delete 

2230 # a file if the refs to be trashed are the only refs associated 

2231 # with the file. 

2232 # This requires multiple copies of the trashed items 

2233 trashed, artifacts_to_keep = trash_data 

2234 

2235 if artifacts_to_keep is None: 

2236 # The bridge is not helping us so have to work it out 

2237 # ourselves. This is not going to be as efficient. 

2238 trashed = list(trashed) 

2239 

2240 # The instance check is for mypy since up to this point it 

2241 # does not know the type of info. 

2242 path_map = self._refs_associated_with_artifacts( 

2243 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2244 ) 

2245 

2246 for ref, info in trashed: 

2247 

2248 # Mypy needs to know this is not the base class 

2249 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2250 

2251 # Check for mypy 

2252 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2253 

2254 path_map[info.path].remove(ref.id) 

2255 if not path_map[info.path]: 2255 ↛ 2246line 2255 didn't jump to line 2246, because the condition on line 2255 was never false

2256 del path_map[info.path] 

2257 

2258 artifacts_to_keep = set(path_map) 

2259 

2260 for ref, info in trashed: 

2261 

2262 # Should not happen for this implementation but need 

2263 # to keep mypy happy. 

2264 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2265 

2266 # Mypy needs to know this is not the base class 

2267 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2268 

2269 # Check for mypy 

2270 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2271 

2272 if info.path in artifacts_to_keep: 

2273 # This is a multi-dataset artifact and we are not 

2274 # removing all associated refs. 

2275 continue 

2276 

2277 # Only trashed refs still known to datastore will be returned. 

2278 location = info.file_location(self.locationFactory) 

2279 

2280 # Point of no return for this artifact 

2281 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2282 try: 

2283 self._delete_artifact(location) 

2284 except FileNotFoundError: 

2285 # If the file itself has been deleted there is nothing 

2286 # we can do about it. It is possible that trash has 

2287 # been run in parallel in another process or someone 

2288 # decided to delete the file. It is unlikely to come 

2289 # back and so we should still continue with the removal 

2290 # of the entry from the trash table. It is also possible 

2291 # we removed it in a previous iteration if it was 

2292 # a multi-dataset artifact. The delete artifact method 

2293 # will log a debug message in this scenario. 

2294 # Distinguishing a file that was missing before trash started 

2295 # from a file already removed earlier as part of this same 

2296 # trash operation is not worth the extra bookkeeping given 

2297 # the potential memory cost. 

2298 pass 

2299 except Exception as e: 

2300 if ignore_errors: 

2301 # Use a debug message here even though it's not 

2302 # a good situation. In some cases this can be 

2303 # caused by a race between user A and user B 

2304 # and neither of them has permissions for the 

2305 # other's files. Butler does not know about users 

2306 # and trash has no idea what collections these 

2307 # files were in (without guessing from a path). 

2308 log.debug( 

2309 "Encountered error removing artifact %s from datastore %s: %s", 

2310 location.uri, 

2311 self.name, 

2312 e, 

2313 ) 

2314 else: 

2315 raise 

2316 
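# [Editor's usage sketch -- not part of the measured source] Removal is
# two-phase: trash() drops the datasets from the local cache and marks them
# in the trash table (or, in trust mode, deletes unknown-to-records
# artifacts directly), while emptyTrash() later deletes artifacts no longer
# referenced by any remaining dataset. Placeholders as above.
datastore.trash(refs)
datastore.emptyTrash()
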

2317 @transactional 

2318 def transfer_from( 

2319 self, 

2320 source_datastore: Datastore, 

2321 refs: Iterable[DatasetRef], 

2322 local_refs: Optional[Iterable[DatasetRef]] = None, 

2323 transfer: str = "auto", 

2324 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2325 ) -> None: 

2326 # Docstring inherited 

2327 if type(self) is not type(source_datastore): 

2328 raise TypeError( 

2329 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2330 f"source datastore ({type(source_datastore)})." 

2331 ) 

2332 

2333 # Be explicit for mypy 

2334 if not isinstance(source_datastore, FileDatastore): 2334 ↛ 2335line 2334 didn't jump to line 2335, because the condition on line 2334 was never true

2335 raise TypeError( 

2336 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2337 f" {type(source_datastore)}" 

2338 ) 

2339 

2340 # Stop early if "direct" transfer mode is requested. That would 

2341 # require that the URI inside the source datastore should be stored 

2342 # directly in the target datastore, which seems unlikely to be useful 

2343 # since at any moment the source datastore could delete the file. 

2344 if transfer in ("direct", "split"): 

2345 raise ValueError( 

2346 f"Can not transfer from a source datastore using {transfer} mode since" 

2347 " those files are controlled by the other datastore." 

2348 ) 

2349 

2350 # Empty existence lookup if none given. 

2351 if artifact_existence is None: 

2352 artifact_existence = {} 

2353 

2354 # We will go through the list multiple times so must convert 

2355 # generators to lists. 

2356 refs = list(refs) 

2357 

2358 if local_refs is None: 

2359 local_refs = refs 

2360 else: 

2361 local_refs = list(local_refs) 

2362 

2363 # In order to handle disassembled composites the code works 

2364 # at the records level since it can assume that internal APIs 

2365 # can be used. 

2366 # - If the record already exists in the destination this is assumed 

2367 # to be okay. 

2368 # - If there is no record but the source and destination URIs are 

2369 # identical no transfer is done but the record is added. 

2370 # - If the source record refers to an absolute URI currently assume 

2371 # that that URI should remain absolute and will be visible to the 

2372 # destination butler. May need to have a flag to indicate whether 

2373 # the dataset should be transferred. This will only happen if 

2374 # the detached Butler has had a local ingest. 

2375 

2376 # What we really want is all the records in the source datastore 

2377 # associated with these refs. Or derived ones if they don't exist 

2378 # in the source. 

2379 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2380 

2381 # The source dataset_ids are the keys in these records 

2382 source_ids = set(source_records) 

2383 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2384 

2385 # The not None check is to appease mypy 

2386 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2387 missing_ids = requested_ids - source_ids 

2388 

2389 # Missing IDs can be okay if that datastore has allowed 

2390 # gets based on file existence. Should we transfer what we can 

2391 # or complain about it and warn? 

2392 if missing_ids and not source_datastore.trustGetRequest: 2392 ↛ 2393line 2392 didn't jump to line 2393, because the condition on line 2392 was never true

2393 raise ValueError( 

2394 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2395 ) 

2396 

2397 # Need to map these missing IDs to a DatasetRef so we can guess 

2398 # the details. 

2399 if missing_ids: 

2400 log.info( 

2401 "Number of expected datasets missing from source datastore records: %d out of %d", 

2402 len(missing_ids), 

2403 len(requested_ids), 

2404 ) 

2405 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2406 

2407 # This should be chunked in case we end up having to check 

2408 # the file store since we need some log output to show 

2409 # progress. 

2410 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2411 records = {} 

2412 for missing in missing_ids_chunk: 

2413 # Ask the source datastore where the missing artifacts 

2414 # should be. An execution butler might not know about the 

2415 # artifacts even if they are there. 

2416 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2417 records[missing] = [info for _, info in expected] 

2418 

2419 # Call the mexist helper method in case we have not already 

2420 # checked these artifacts such that artifact_existence is 

2421 # empty. This allows us to benefit from parallelism. 

2422 # datastore.mexists() itself does not give us access to the 

2423 # derived datastore record. 

2424 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2425 ref_exists = source_datastore._process_mexists_records( 

2426 id_to_ref, records, False, artifact_existence=artifact_existence 

2427 ) 

2428 

2429 # Now go through the records and propagate the ones that exist. 

2430 location_factory = source_datastore.locationFactory 

2431 for missing, record_list in records.items(): 

2432 # Skip completely if the ref does not exist. 

2433 ref = id_to_ref[missing] 

2434 if not ref_exists[ref]: 

2435 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2436 continue 

2437 # Check for file artifact to decide which parts of a 

2438 # disassembled composite do exist. If there is only a 

2439 # single record we don't even need to look because it can't 

2440 # be a composite and must exist. 

2441 if len(record_list) == 1: 

2442 dataset_records = record_list 

2443 else: 

2444 dataset_records = [ 

2445 record 

2446 for record in record_list 

2447 if artifact_existence[record.file_location(location_factory).uri] 

2448 ] 

2449 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2450 

2451 # Rely on source_records being a defaultdict. 

2452 source_records[missing].extend(dataset_records) 

2453 

2454 # See if we already have these records 

2455 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2456 

2457 # The artifacts to register 

2458 artifacts = [] 

2459 

2460 # Refs that already exist 

2461 already_present = [] 

2462 

2463 # Now can transfer the artifacts 

2464 for source_ref, target_ref in zip(refs, local_refs): 

2465 if target_ref.id in target_records: 

2466 # Already have an artifact for this. 

2467 already_present.append(target_ref) 

2468 continue 

2469 

2470 # mypy needs to know these are always resolved refs 

2471 for info in source_records[source_ref.getCheckedId()]: 

2472 source_location = info.file_location(source_datastore.locationFactory) 

2473 target_location = info.file_location(self.locationFactory) 

2474 if source_location == target_location: 2474 ↛ 2478line 2474 didn't jump to line 2478, because the condition on line 2474 was never true

2475 # Either the dataset is already in the target datastore 

2476 # (which is how execution butler currently runs) or 

2477 # it is an absolute URI. 

2478 if source_location.pathInStore.isabs(): 

2479 # Just because we can see the artifact when running 

2480 # the transfer doesn't mean it will be generally 

2481 # accessible to a user of this butler. For now warn 

2482 # but assume it will be accessible. 

2483 log.warning( 

2484 "Transfer request for an outside-datastore artifact has been found at %s", 

2485 source_location, 

2486 ) 

2487 else: 

2488 # Need to transfer it to the new location. 

2489 # Assume we should always overwrite. If the artifact 

2490 # is there this might indicate that a previous transfer 

2491 # was interrupted but was not able to be rolled back 

2492 # completely (eg pre-emption) so follow Datastore default 

2493 # and overwrite. 

2494 target_location.uri.transfer_from( 

2495 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2496 ) 

2497 

2498 artifacts.append((target_ref, info)) 

2499 

2500 self._register_datasets(artifacts) 

2501 

2502 if already_present: 

2503 n_skipped = len(already_present) 

2504 log.info( 

2505 "Skipped transfer of %d dataset%s already present in datastore", 

2506 n_skipped, 

2507 "" if n_skipped == 1 else "s", 

2508 ) 

2509 
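# [Editor's usage sketch -- not part of the measured source] Copy datasets
# between two FileDatastore instances of the same type. "direct" and "split"
# modes are rejected, and an artifact-existence cache (as used with
# mexists()) can be passed in to avoid repeating remote checks.
# ``source_datastore`` and ``target_datastore`` are placeholder instances.
from typing import Dict

from lsst.resources import ResourcePath

artifact_existence: Dict[ResourcePath, bool] = {}
target_datastore.transfer_from(
    source_datastore, refs, transfer="copy", artifact_existence=artifact_existence
)
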

2510 @transactional 

2511 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2512 # Docstring inherited. 

2513 refs = list(refs) 

2514 self.bridge.forget(refs) 

2515 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2516 

2517 def validateConfiguration( 

2518 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2519 ) -> None: 

2520 """Validate some of the configuration for this datastore. 

2521 

2522 Parameters 

2523 ---------- 

2524 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2525 Entities to test against this configuration. Can be differing 

2526 types. 

2527 logFailures : `bool`, optional 

2528 If `True`, output a log message for every validation error 

2529 detected. 

2530 

2531 Raises 

2532 ------ 

2533 DatastoreValidationError 

2534 Raised if there is a validation problem with a configuration. 

2535 All the problems are reported in a single exception. 

2536 

2537 Notes 

2538 ----- 

2539 This method checks that all the supplied entities have valid file 

2540 templates and also have formatters defined. 

2541 """ 

2542 

2543 templateFailed = None 

2544 try: 

2545 self.templates.validateTemplates(entities, logFailures=logFailures) 

2546 except FileTemplateValidationError as e: 

2547 templateFailed = str(e) 

2548 

2549 formatterFailed = [] 

2550 for entity in entities: 

2551 try: 

2552 self.formatterFactory.getFormatterClass(entity) 

2553 except KeyError as e: 

2554 formatterFailed.append(str(e)) 

2555 if logFailures: 2555 ↛ 2550line 2555 didn't jump to line 2550, because the condition on line 2555 was never false

2556 log.critical("Formatter failure: %s", e) 

2557 

2558 if templateFailed or formatterFailed: 

2559 messages = [] 

2560 if templateFailed: 2560 ↛ 2561line 2560 didn't jump to line 2561, because the condition on line 2560 was never true

2561 messages.append(templateFailed) 

2562 if formatterFailed: 2562 ↛ 2564line 2562 didn't jump to line 2564, because the condition on line 2562 was never false

2563 messages.append(",".join(formatterFailed)) 

2564 msg = ";\n".join(messages) 

2565 raise DatastoreValidationError(msg) 

2566 
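# [Editor's usage sketch -- not part of the measured source] Check that file
# templates and formatters are defined for a set of entities before relying
# on the datastore. ``dataset_types`` is a placeholder iterable of
# DatasetType objects.
from lsst.daf.butler import DatastoreValidationError

try:
    datastore.validateConfiguration(dataset_types, logFailures=True)
except DatastoreValidationError as e:
    log.error("Datastore configuration problem: %s", e)
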

2567 def getLookupKeys(self) -> Set[LookupKey]: 

2568 # Docstring is inherited from base class 

2569 return ( 

2570 self.templates.getLookupKeys() 

2571 | self.formatterFactory.getLookupKeys() 

2572 | self.constraints.getLookupKeys() 

2573 ) 

2574 

2575 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2576 # Docstring is inherited from base class 

2577 # The key can be valid in either formatters or templates so we can 

2578 # only check the template if it exists 

2579 if lookupKey in self.templates: 

2580 try: 

2581 self.templates[lookupKey].validateTemplate(entity) 

2582 except FileTemplateValidationError as e: 

2583 raise DatastoreValidationError(e) from e 

2584 

2585 def export( 

2586 self, 

2587 refs: Iterable[DatasetRef], 

2588 *, 

2589 directory: Optional[ResourcePathExpression] = None, 

2590 transfer: Optional[str] = "auto", 

2591 ) -> Iterable[FileDataset]: 

2592 # Docstring inherited from Datastore.export. 

2593 if transfer is not None and directory is None: 2593 ↛ 2594line 2593 didn't jump to line 2594, because the condition on line 2593 was never true

2594 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2595 

2596 # Force the directory to be a URI object 

2597 directoryUri: Optional[ResourcePath] = None 

2598 if directory is not None: 2598 ↛ 2601line 2598 didn't jump to line 2601, because the condition on line 2598 was never false

2599 directoryUri = ResourcePath(directory, forceDirectory=True) 

2600 

2601 if transfer is not None and directoryUri is not None: 2601 ↛ 2606line 2601 didn't jump to line 2606, because the condition on line 2601 was never false

2602 # mypy needs the second test 

2603 if not directoryUri.exists(): 2603 ↛ 2604line 2603 didn't jump to line 2604, because the condition on line 2603 was never true

2604 raise FileNotFoundError(f"Export location {directory} does not exist") 

2605 

2606 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2607 for ref in progress.wrap(refs, "Exporting dataset files"): 

2608 fileLocations = self._get_dataset_locations_info(ref) 

2609 if not fileLocations: 2609 ↛ 2610line 2609 didn't jump to line 2610, because the condition on line 2609 was never true

2610 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2611 # For now we can not export disassembled datasets 

2612 if len(fileLocations) > 1: 

2613 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2614 location, storedFileInfo = fileLocations[0] 

2615 

2616 pathInStore = location.pathInStore.path 

2617 if transfer is None: 2617 ↛ 2621line 2617 didn't jump to line 2621, because the condition on line 2617 was never true

2618 # TODO: do we also need to return the readStorageClass somehow? 

2619 # We will use the path in store directly. If this is an 

2620 # absolute URI, preserve it. 

2621 if location.pathInStore.isabs(): 

2622 pathInStore = str(location.uri) 

2623 elif transfer == "direct": 2623 ↛ 2625line 2623 didn't jump to line 2625, because the condition on line 2623 was never true

2624 # Use full URIs to the remote store in the export 

2625 pathInStore = str(location.uri) 

2626 else: 

2627 # mypy needs help 

2628 assert directoryUri is not None, "directoryUri must be defined to get here" 

2629 storeUri = ResourcePath(location.uri) 

2630 

2631 # if the datastore has an absolute URI to a resource, we 

2632 # have two options: 

2633 # 1. Keep the absolute URI in the exported YAML 

2634 # 2. Allocate a new name in the local datastore and transfer 

2635 # it. 

2636 # For now go with option 2 

2637 if location.pathInStore.isabs(): 2637 ↛ 2638line 2637 didn't jump to line 2638, because the condition on line 2637 was never true

2638 template = self.templates.getTemplate(ref) 

2639 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2640 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2641 

2642 exportUri = directoryUri.join(pathInStore) 

2643 exportUri.transfer_from(storeUri, transfer=transfer) 

2644 

2645 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2646 
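# [Editor's usage sketch -- not part of the measured source] Export the
# artifacts for some refs into an existing directory; the yielded
# FileDataset entries describe the copied files and their formatters. The
# directory name "export_dir" is a placeholder and must already exist.
file_datasets = list(datastore.export(refs, directory="export_dir", transfer="copy"))
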

2647 @staticmethod 

2648 def computeChecksum( 

2649 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2650 ) -> Optional[str]: 

2651 """Compute the checksum of the supplied file. 

2652 

2653 Parameters 

2654 ---------- 

2655 uri : `lsst.resources.ResourcePath` 

2656 Name of resource to calculate checksum from. 

2657 algorithm : `str`, optional 

2658 Name of algorithm to use. Must be one of the algorithms supported 

2659 by :py:mod:`hashlib`. 

2660 block_size : `int` 

2661 Number of bytes to read from file at one time. 

2662 

2663 Returns 

2664 ------- 

2665 hexdigest : `str` 

2666 Hex digest of the file. 

2667 

2668 Notes 

2669 ----- 

2670 Currently returns None if the URI is for a remote resource. 

2671 """ 

2672 if algorithm not in hashlib.algorithms_guaranteed: 2672 ↛ 2673line 2672 didn't jump to line 2673, because the condition on line 2672 was never true

2673 raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib") 

2674 

2675 if not uri.isLocal: 2675 ↛ 2676line 2675 didn't jump to line 2676, because the condition on line 2675 was never true

2676 return None 

2677 

2678 hasher = hashlib.new(algorithm) 

2679 

2680 with uri.as_local() as local_uri: 

2681 with open(local_uri.ospath, "rb") as f: 

2682 for chunk in iter(lambda: f.read(block_size), b""): 

2683 hasher.update(chunk) 

2684 

2685 return hasher.hexdigest() 

2686 
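# [Editor's usage sketch -- not part of the measured source] The static
# checksum helper works on local files only and returns None for remote
# URIs; the default algorithm is blake2b. "somefile.fits" is a placeholder
# path to an existing local file.
from lsst.resources import ResourcePath

digest = FileDatastore.computeChecksum(ResourcePath("somefile.fits"))
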

2687 def needs_expanded_data_ids( 

2688 self, 

2689 transfer: Optional[str], 

2690 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2691 ) -> bool: 

2692 # Docstring inherited. 

2693 # This _could_ also use entity to inspect whether the filename template 

2694 # involves placeholders other than the required dimensions for its 

2695 # dataset type, but that's not necessary for correctness; it just 

2696 # enables more optimizations (perhaps only in theory). 

2697 return transfer not in ("direct", None) 

2698 

2699 def import_records(self, data: Mapping[str, DatastoreRecordData]) -> None: 

2700 # Docstring inherited from the base class. 

2701 record_data = data.get(self.name) 

2702 if not record_data: 2702 ↛ 2703line 2702 didn't jump to line 2703, because the condition on line 2702 was never true

2703 return 

2704 

2705 if record_data.refs: 

2706 self._bridge.insert(record_data.refs) 

2707 

2708 # TODO: Verify that there are no unexpected table names in the dict? 

2709 records = record_data.records.get(self._table.name) 

2710 if records: 

2711 unpacked_records = [] 

2712 for info in records: 

2713 assert isinstance(info, StoredFileInfo), "Expecting StoredFileInfo records" 

2714 unpacked_records.append(info.to_record()) 

2715 self._table.insert(*unpacked_records) 

2716 

2717 def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, DatastoreRecordData]: 

2718 # Docstring inherited from the base class. 

2719 exported_refs = list(self._bridge.check(refs)) 

2720 

2721 id2ref = {ref.id: ref for ref in exported_refs} 

2722 rows = self._table.fetch(dataset_id=list(id2ref.keys())) 

2723 records: List[StoredDatastoreItemInfo] = [StoredFileInfo.from_record(row) for row in rows] 

2724 

2725 record_data = DatastoreRecordData(refs=exported_refs, records={self._table.name: records}) 

2726 return {self.name: record_data}
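

# [Editor's usage sketch -- not part of the measured source] Datastore
# records can be round-tripped between two repositories whose datastores
# share the same name and configuration, by exporting from one and importing
# into the other. ``source_datastore`` and ``target_datastore`` are
# placeholder instances as in the transfer_from sketch above.
record_data = source_datastore.export_records(refs)
target_datastore.import_records(record_data)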