Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%


862 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 CompositesMap, 

48 Config, 

49 DatasetId, 

50 DatasetRef, 

51 DatasetType, 

52 DatasetTypeNotSupportedError, 

53 Datastore, 

54 DatastoreCacheManager, 

55 DatastoreConfig, 

56 DatastoreDisabledCacheManager, 

57 DatastoreValidationError, 

58 FileDataset, 

59 FileDescriptor, 

60 FileTemplates, 

61 FileTemplateValidationError, 

62 Formatter, 

63 FormatterFactory, 

64 Location, 

65 LocationFactory, 

66 Progress, 

67 StorageClass, 

68 StoredFileInfo, 

69 ddl, 

70) 

71from lsst.daf.butler.core.repoRelocation import replaceRoot 

72from lsst.daf.butler.core.utils import transactional 

73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

74from lsst.resources import ResourcePath, ResourcePathExpression 

75from lsst.utils.introspection import get_class_of, get_instance_of 

76from lsst.utils.iteration import chunk_iterable 

77 

78# For VERBOSE logging usage. 

79from lsst.utils.logging import VERBOSE, getLogger 

80from lsst.utils.timer import time_this 

81from sqlalchemy import BigInteger, String 

82 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING:  (85 ↛ 86: line 85 didn't jump to line 86, because the condition on line 85 was never true)

86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Mapping[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Mapping[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ResourcePath 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters( 

212 DatastoreConfig, 

213 config, 

214 full, 

215 toUpdate={"root": root}, 

216 toCopy=("cls", ("records", "table")), 

217 overwrite=overwrite, 

218 ) 

219 

220 @classmethod 

221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

222 return ddl.TableSpec( 

223 fields=[ 

224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

228 # Use empty string to indicate no component 

229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

230 # TODO: should checksum be Base64Bytes instead? 

231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

233 ], 

234 unique=frozenset(), 

235 indexes=[tuple(["path"])], 

236 ) 

237 
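
Editor's note: to make the table layout above concrete, a single record in this opaque table would carry the fields declared by the `FieldSpec` entries. This is only an illustrative sketch; the field names come from the spec, every value below is invented.

example_record = {
    "dataset_id": "00000000-0000-0000-0000-000000000000",  # primary key; column type supplied by the registry
    "path": "calexp/r/calexp_r_12345_42.fits",              # relative to the datastore root (absolute for "direct" ingests)
    "formatter": "lsst.daf.butler.formatters.json.JsonFormatter",  # fully-qualified formatter class (example only)
    "storage_class": "StructuredDataDict",                   # example storage class name
    "component": "",                                          # empty string means "no component"
    "checksum": None,                                         # optional; only populated when checksums are enabled
    "file_size": 1024,                                        # optional; -1 when not recorded
}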

238 def __init__( 

239 self, 

240 config: Union[DatastoreConfig, str], 

241 bridgeManager: DatastoreRegistryBridgeManager, 

242 butlerRoot: Optional[str] = None, 

243 ): 

244 super().__init__(config, bridgeManager) 

245 if "root" not in self.config:  (245 ↛ 246: line 245 didn't jump to line 246, because the condition on line 245 was never true)

246 raise ValueError("No root directory specified in configuration") 

247 

248 # Name ourselves either using an explicit name or a name 

249 # derived from the (unexpanded) root 

250 if "name" in self.config: 

251 self.name = self.config["name"] 

252 else: 

253 # We use the unexpanded root in the name to indicate that this 

254 # datastore can be moved without having to update registry. 

255 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

256 

257 # Support repository relocation in config 

258 # Existence of self.root is checked in subclass 

259 self.root = ResourcePath( 

260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

261 ) 

262 

263 self.locationFactory = LocationFactory(self.root) 

264 self.formatterFactory = FormatterFactory() 

265 

266 # Now associate formatters with storage classes 

267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

268 

269 # Read the file naming templates 

270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

271 

272 # See if composites should be disassembled 

273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

280 ) 

281 # Interface to Registry. 

282 self._bridge = bridgeManager.register(self.name) 

283 except ReadOnlyDatabaseError: 

284 # If the database is read only and we just tried and failed to 

285 # create a table, it means someone is trying to create a read-only 

286 # butler client for an empty repo. That should be okay, as long 

287 # as they then try to get any datasets before some other client 

288 # creates the table. Chances are they're just validating 

289 # configuration. 

290 pass 

291 

292 # Determine whether checksums should be used - default to False 

293 self.useChecksum = self.config.get("checksum", False) 

294 

295 # Determine whether we can fall back to configuration if a 

296 # requested dataset is not known to registry 

297 self.trustGetRequest = self.config.get("trust_get_request", False) 

298 

299 # Create a cache manager 

300 self.cacheManager: AbstractDatastoreCacheManager 

301 if "cached" in self.config:  (301 ↛ 304: line 301 didn't jump to line 304, because the condition on line 301 was never false)

302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

305 

306 # Check existence and create directory structure if necessary 

307 if not self.root.exists(): 

308 if "create" not in self.config or not self.config["create"]:  (308 ↛ 309: line 308 didn't jump to line 309, because the condition on line 308 was never true)

309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

310 try: 

311 self.root.mkdir() 

312 except Exception as e: 

313 raise ValueError( 

314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

315 ) from e 

316 
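
Editor's note: for orientation, the constructor above reads the configuration keys sketched below. This is only an illustration of the shape implied by the code; the values and the table name are made up, and the empty mappings stand in for real formatter/template/composite/cache configuration.

example_config = {
    "root": "<butlerRoot>/datastore",                 # placeholder expanded via replaceRoot()
    "create": True,                                   # allow the root directory to be created
    "records": {"table": "file_datastore_records"},   # hypothetical opaque table name
    "checksum": False,                                # -> self.useChecksum
    "trust_get_request": False,                       # -> self.trustGetRequest
    "formatters": {},                                 # storage class / dataset type -> formatter mappings
    "templates": {},                                  # file naming templates
    "composites": {},                                 # disassembly rules
    "cached": {},                                     # cache manager configuration
    # "name": "...",                                  # optional explicit datastore name
}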

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 def _artifact_exists(self, location: Location) -> bool: 

325 """Check that an artifact exists in this datastore at the specified 

326 location. 

327 

328 Parameters 

329 ---------- 

330 location : `Location` 

331 Expected location of the artifact associated with this datastore. 

332 

333 Returns 

334 ------- 

335 exists : `bool` 

336 `True` if the location can be found, `False` otherwise. 

337 """ 

338 log.debug("Checking if resource exists: %s", location.uri) 

339 return location.uri.exists() 

340 

341 def _delete_artifact(self, location: Location) -> None: 

342 """Delete the artifact from the datastore. 

343 

344 Parameters 

345 ---------- 

346 location : `Location` 

347 Location of the artifact associated with this datastore. 

348 """ 

349 if location.pathInStore.isabs():  (349 ↛ 350: line 349 didn't jump to line 350, because the condition on line 349 was never true)

350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

351 

352 try: 

353 location.uri.remove() 

354 except FileNotFoundError: 

355 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

356 raise 

357 except Exception as e: 

358 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

359 raise 

360 log.debug("Successfully deleted file: %s", location.uri) 

361 

362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

363 # Docstring inherited from GenericBaseDatastore 

364 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

365 self._table.insert(*records) 

366 

367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

368 # Docstring inherited from GenericBaseDatastore 

369 

370 # Look for the dataset_id -- there might be multiple matches 

371 # if we have disassembled the dataset. 

372 records = self._table.fetch(dataset_id=ref.id) 

373 return [StoredFileInfo.from_record(record) for record in records] 

374 

375 def _get_stored_records_associated_with_refs( 

376 self, refs: Iterable[DatasetIdRef] 

377 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

378 """Retrieve all records associated with the provided refs. 

379 

380 Parameters 

381 ---------- 

382 refs : iterable of `DatasetIdRef` 

383 The refs for which records are to be retrieved. 

384 

385 Returns 

386 ------- 

387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

388 The matching records indexed by the ref ID. The number of entries 

389 in the dict can be smaller than the number of requested refs. 

390 """ 

391 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

392 

393 # Uniqueness is dataset_id + component so can have multiple records 

394 # per ref. 

395 records_by_ref = defaultdict(list) 

396 for record in records: 

397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

398 return records_by_ref 

399 
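
Editor's note: one dataset ID maps to several records when a composite has been disassembled into per-component files, which is why the method above groups rows with a defaultdict. A self-contained sketch of the same pattern, with invented rows:

from collections import defaultdict

rows = [
    {"dataset_id": "a", "component": "image"},
    {"dataset_id": "a", "component": "mask"},
    {"dataset_id": "b", "component": ""},
]
records_by_ref = defaultdict(list)
for row in rows:
    records_by_ref[row["dataset_id"]].append(row)
assert len(records_by_ref["a"]) == 2 and len(records_by_ref["b"]) == 1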

400 def _refs_associated_with_artifacts( 

401 self, paths: List[Union[str, ResourcePath]] 

402 ) -> Dict[str, Set[DatasetId]]: 

403 """Return paths and associated dataset refs. 

404 

405 Parameters 

406 ---------- 

407 paths : `list` of `str` or `lsst.resources.ResourcePath` 

408 All the paths to include in search. 

409 

410 Returns 

411 ------- 

412 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

413 Mapping of each path to a set of associated database IDs. 

414 """ 

415 records = self._table.fetch(path=[str(path) for path in paths]) 

416 result = defaultdict(set) 

417 for row in records: 

418 result[row["path"]].add(row["dataset_id"]) 

419 return result 

420 

421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

422 """Return all dataset refs associated with the supplied path. 

423 

424 Parameters 

425 ---------- 

426 pathInStore : `lsst.resources.ResourcePath` 

427 Path of interest in the data store. 

428 

429 Returns 

430 ------- 

431 ids : `set` of `DatasetId` 

432 All `DatasetRef` IDs associated with this path. 

433 """ 

434 records = list(self._table.fetch(path=str(pathInStore))) 

435 ids = {r["dataset_id"] for r in records} 

436 return ids 

437 

438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

439 # Docstring inherited from GenericBaseDatastore 

440 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

441 

442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

443 r"""Find all the `Location`\ s of the requested dataset in the 

444 `Datastore` and the associated stored file information. 

445 

446 Parameters 

447 ---------- 

448 ref : `DatasetRef` 

449 Reference to the required `Dataset`. 

450 

451 Returns 

452 ------- 

453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

454 Location of the dataset within the datastore and 

455 stored information about each file and its formatter. 

456 """ 

457 # Get the file information (this will fail if no file) 

458 records = self.getStoredItemsInfo(ref) 

459 

460 # Use the path to determine the location -- we need to take 

461 # into account absolute URIs in the datastore record 

462 return [(r.file_location(self.locationFactory), r) for r in records] 

463 

464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

465 """Check that there is only one dataset associated with the 

466 specified artifact. 

467 

468 Parameters 

469 ---------- 

470 ref : `DatasetRef` or `FakeDatasetRef` 

471 Dataset to be removed. 

472 location : `Location` 

473 The location of the artifact to be removed. 

474 

475 Returns 

476 ------- 

477 can_remove : `bool` 

478 True if the artifact can be safely removed. 

479 """ 

480 # Can't ever delete absolute URIs. 

481 if location.pathInStore.isabs(): 

482 return False 

483 

484 # Get all entries associated with this path 

485 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

486 if not allRefs: 

487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

488 

489 # Remove these refs from all the refs and if there is nothing left 

490 # then we can delete 

491 remainingRefs = allRefs - {ref.id} 

492 

493 if remainingRefs: 

494 return False 

495 return True 

496 
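
Editor's note: the deletion-safety rule above is pure set arithmetic: an artifact may be removed only when no other dataset ID still references the same path. A minimal standalone sketch with hypothetical IDs:

def can_remove(all_ref_ids: set, ref_id: int) -> bool:
    remaining = all_ref_ids - {ref_id}
    return not remaining

assert can_remove({1}, 1) is True          # sole owner: safe to delete
assert can_remove({1, 2}, 1) is False      # shared artifact: must be kept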

497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

498 """Predict the location and related file information of the requested 

499 dataset in this datastore. 

500 

501 Parameters 

502 ---------- 

503 ref : `DatasetRef` 

504 Reference to the required `Dataset`. 

505 

506 Returns 

507 ------- 

508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

509 Expected Location of the dataset within the datastore and 

510 placeholder information about each file and its formatter. 

511 

512 Notes 

513 ----- 

514 Uses the current configuration to determine how we would expect the 

515 datastore files to have been written if we couldn't ask registry. 

516 This is safe so long as there has been no change to datastore 

517 configuration between writing the dataset and wanting to read it. 

518 Will not work for files that have been ingested without using the 

519 standard file template or default formatter. 

520 """ 

521 

522 # If we have a component ref we always need to ask the questions 

523 # of the composite. If the composite is disassembled this routine 

524 # should return all components. If the composite was not 

525 # disassembled the composite is what is stored regardless of 

526 # component request. Note that if the caller has disassembled 

527 # a composite there is no way for this guess to know that 

528 # without trying both the composite and component ref and seeing 

529 # if there is something at the component Location even without 

530 # disassembly being enabled. 

531 if ref.datasetType.isComponent(): 

532 ref = ref.makeCompositeRef() 

533 

534 # See if the ref is a composite that should be disassembled 

535 doDisassembly = self.composites.shouldBeDisassembled(ref) 

536 

537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

538 

539 if doDisassembly: 

540 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

541 compRef = ref.makeComponentRef(component) 

542 location, formatter = self._determine_put_formatter_location(compRef) 

543 all_info.append((location, formatter, componentStorage, component)) 

544 

545 else: 

546 # Always use the composite ref if no disassembly 

547 location, formatter = self._determine_put_formatter_location(ref) 

548 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

549 

550 # Convert the list of tuples to have StoredFileInfo as second element 

551 return [ 

552 ( 

553 location, 

554 StoredFileInfo( 

555 formatter=formatter, 

556 path=location.pathInStore.path, 

557 storageClass=storageClass, 

558 component=component, 

559 checksum=None, 

560 file_size=-1, 

561 ), 

562 ) 

563 for location, formatter, storageClass, component in all_info 

564 ] 

565 

566 def _prepare_for_get( 

567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

568 ) -> List[DatastoreFileGetInformation]: 

569 """Check parameters for ``get`` and obtain formatter and 

570 location. 

571 

572 Parameters 

573 ---------- 

574 ref : `DatasetRef` 

575 Reference to the required Dataset. 

576 parameters : `dict` 

577 `StorageClass`-specific parameters that specify, for example, 

578 a slice of the dataset to be loaded. 

579 

580 Returns 

581 ------- 

582 getInfo : `list` [`DatastoreFileGetInformation`] 

583 Parameters needed to retrieve each file. 

584 """ 

585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

586 

587 # Get file metadata and internal metadata 

588 fileLocations = self._get_dataset_locations_info(ref) 

589 if not fileLocations: 

590 if not self.trustGetRequest: 

591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

592 # Assume the dataset is where we think it should be 

593 fileLocations = self._get_expected_dataset_locations_info(ref) 

594 

595 # The storage class we want to use eventually 

596 refStorageClass = ref.datasetType.storageClass 

597 

598 if len(fileLocations) > 1: 

599 disassembled = True 

600 

601 # If trust is involved it is possible that there will be 

602 # components listed here that do not exist in the datastore. 

603 # Explicitly check for file artifact existence and filter out any 

604 # that are missing. 

605 if self.trustGetRequest: 

606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

607 

608 # For now complain only if we have no components at all. One 

609 # component is probably a problem but we can punt that to the 

610 # assembler. 

611 if not fileLocations:  (611 ↛ 612: line 611 didn't jump to line 612, because the condition on line 611 was never true)

612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

613 

614 else: 

615 disassembled = False 

616 

617 # Is this a component request? 

618 refComponent = ref.datasetType.component() 

619 

620 fileGetInfo = [] 

621 for location, storedFileInfo in fileLocations: 

622 

623 # The storage class used to write the file 

624 writeStorageClass = storedFileInfo.storageClass 

625 

626 # If this has been disassembled we need read to match the write 

627 if disassembled: 

628 readStorageClass = writeStorageClass 

629 else: 

630 readStorageClass = refStorageClass 

631 

632 formatter = get_instance_of( 

633 storedFileInfo.formatter, 

634 FileDescriptor( 

635 location, 

636 readStorageClass=readStorageClass, 

637 storageClass=writeStorageClass, 

638 parameters=parameters, 

639 ), 

640 ref.dataId, 

641 ) 

642 

643 formatterParams, notFormatterParams = formatter.segregateParameters() 

644 

645 # Of the remaining parameters, extract the ones supported by 

646 # this StorageClass (for components not all will be handled) 

647 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

648 

649 # The ref itself could be a component if the dataset was 

650 # disassembled by butler, or we disassembled in datastore and 

651 # components came from the datastore records 

652 component = storedFileInfo.component if storedFileInfo.component else refComponent 

653 

654 fileGetInfo.append( 

655 DatastoreFileGetInformation( 

656 location, 

657 formatter, 

658 storedFileInfo, 

659 assemblerParams, 

660 formatterParams, 

661 component, 

662 readStorageClass, 

663 ) 

664 ) 

665 

666 return fileGetInfo 

667 

668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

669 """Check the arguments for ``put`` and obtain formatter and 

670 location. 

671 

672 Parameters 

673 ---------- 

674 inMemoryDataset : `object` 

675 The dataset to store. 

676 ref : `DatasetRef` 

677 Reference to the associated Dataset. 

678 

679 Returns 

680 ------- 

681 location : `Location` 

682 The location to write the dataset. 

683 formatter : `Formatter` 

684 The `Formatter` to use to write the dataset. 

685 

686 Raises 

687 ------ 

688 TypeError 

689 Supplied object and storage class are inconsistent. 

690 DatasetTypeNotSupportedError 

691 The associated `DatasetType` is not handled by this datastore. 

692 """ 

693 self._validate_put_parameters(inMemoryDataset, ref) 

694 return self._determine_put_formatter_location(ref) 

695 

696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

697 """Calculate the formatter and output location to use for put. 

698 

699 Parameters 

700 ---------- 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 """ 

711 # Work out output file name 

712 try: 

713 template = self.templates.getTemplate(ref) 

714 except KeyError as e: 

715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

716 

717 # Validate the template to protect against filenames from different 

718 # dataIds resolving to the same name and causing overwrite confusion. 

719 template.validateTemplate(ref) 

720 

721 location = self.locationFactory.fromPath(template.format(ref)) 

722 

723 # Get the formatter based on the storage class 

724 storageClass = ref.datasetType.storageClass 

725 try: 

726 formatter = self.formatterFactory.getFormatter( 

727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

728 ) 

729 except KeyError as e: 

730 raise DatasetTypeNotSupportedError( 

731 f"Unable to find formatter for {ref} in datastore {self.name}" 

732 ) from e 

733 

734 # Now that we know the formatter, update the location 

735 location = formatter.makeUpdatedLocation(location) 

736 

737 return location, formatter 

738 

739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

740 # Docstring inherited from base class 

741 if transfer != "auto": 

742 return transfer 

743 

744 # See if the paths are within the datastore or not 

745 inside = [self._pathInStore(d.path) is not None for d in datasets] 

746 

747 if all(inside): 

748 transfer = None 

749 elif not any(inside):  (749 ↛ 758: line 749 didn't jump to line 758, because the condition on line 749 was never false)

750 # Allow ResourcePath to use its own knowledge 

751 transfer = "auto" 

752 else: 

753 # This can happen when importing from a datastore that 

754 # has had some datasets ingested using "direct" mode. 

755 # Allow ResourcePath to sort it out, but warn about it since 

756 # the files that live outside the datastore will not be 

757 # copied into the target datastore. 

758 log.warning( 

759 "Some datasets are inside the datastore and some are outside. Using 'split' " 

760 "transfer mode. This assumes that the files outside the datastore are " 

761 "still accessible to the new butler since they will not be copied into " 

762 "the target datastore." 

763 ) 

764 transfer = "split" 

765 

766 return transfer 

767 
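
Editor's note: the "auto" resolution above depends only on whether each path is already inside the datastore root. A standalone sketch of that decision (just the logic, not the real API):

from typing import List, Optional

def resolve_auto_transfer(inside: List[bool]) -> Optional[str]:
    if all(inside):
        return None      # everything is already in place, no transfer needed
    if not any(inside):
        return "auto"    # all external; let ResourcePath choose the mechanism
    return "split"       # mixture; external files are ingested by reference

assert resolve_auto_transfer([True, True]) is None
assert resolve_auto_transfer([False, False]) == "auto"
assert resolve_auto_transfer([True, False]) == "split"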

768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

769 """Return path relative to datastore root 

770 

771 Parameters 

772 ---------- 

773 path : `lsst.resources.ResourcePathExpression` 

774 Path to dataset. Can be absolute URI. If relative, it is 

775 assumed to be relative to the datastore root. The path within 

776 the datastore is returned, or `None` if the path is outside. 

777 

778 Returns 

779 ------- 

780 inStore : `str` 

781 Path relative to datastore root. Returns `None` if the file is 

782 outside the root. 

783 """ 

784 # Relative path will always be relative to datastore 

785 pathUri = ResourcePath(path, forceAbsolute=False) 

786 return pathUri.relative_to(self.root) 

787 

788 def _standardizeIngestPath( 

789 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

790 ) -> Union[str, ResourcePath]: 

791 """Standardize the path of a to-be-ingested file. 

792 

793 Parameters 

794 ---------- 

795 path : `str` or `lsst.resources.ResourcePath` 

796 Path of a file to be ingested. This parameter is not expected 

797 to be all the types that can be used to construct a 

798 `~lsst.resources.ResourcePath`. 

799 transfer : `str`, optional 

800 How (and whether) the dataset should be added to the datastore. 

801 See `ingest` for details of transfer modes. 

802 This implementation is provided only so 

803 `NotImplementedError` can be raised if the mode is not supported; 

804 actual transfers are deferred to `_extractIngestInfo`. 

805 

806 Returns 

807 ------- 

808 path : `str` or `lsst.resources.ResourcePath` 

809 New path in what the datastore considers standard form. If an 

810 absolute URI was given that will be returned unchanged. 

811 

812 Notes 

813 ----- 

814 Subclasses of `FileDatastore` can implement this method instead 

815 of `_prepIngest`. It should not modify the data repository or given 

816 file in any way. 

817 

818 Raises 

819 ------ 

820 NotImplementedError 

821 Raised if the datastore does not support the given transfer mode 

822 (including the case where ingest is not supported at all). 

823 FileNotFoundError 

824 Raised if one of the given files does not exist. 

825 """ 

826 if transfer not in (None, "direct", "split") + self.root.transferModes:  (826 ↛ 827: line 826 didn't jump to line 827, because the condition on line 826 was never true)

827 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

828 

829 # A relative URI indicates relative to datastore root 

830 srcUri = ResourcePath(path, forceAbsolute=False) 

831 if not srcUri.isabs(): 

832 srcUri = self.root.join(path) 

833 

834 if not srcUri.exists(): 

835 raise FileNotFoundError( 

836 f"Resource at {srcUri} does not exist; note that paths to ingest " 

837 f"are assumed to be relative to {self.root} unless they are absolute." 

838 ) 

839 

840 if transfer is None: 

841 relpath = srcUri.relative_to(self.root) 

842 if not relpath: 

843 raise RuntimeError( 

844 f"Transfer is none but source file ({srcUri}) is not within datastore ({self.root})" 

845 ) 

846 

847 # Return the relative path within the datastore for internal 

848 # transfer 

849 path = relpath 

850 

851 return path 

852 

853 def _extractIngestInfo( 

854 self, 

855 path: ResourcePathExpression, 

856 ref: DatasetRef, 

857 *, 

858 formatter: Union[Formatter, Type[Formatter]], 

859 transfer: Optional[str] = None, 

860 record_validation_info: bool = True, 

861 ) -> StoredFileInfo: 

862 """Relocate (if necessary) and extract `StoredFileInfo` from a 

863 to-be-ingested file. 

864 

865 Parameters 

866 ---------- 

867 path : `lsst.resources.ResourcePathExpression` 

868 URI or path of a file to be ingested. 

869 ref : `DatasetRef` 

870 Reference for the dataset being ingested. Guaranteed to have 

871 ``dataset_id not None``. 

872 formatter : `type` or `Formatter` 

873 `Formatter` subclass to use for this dataset or an instance. 

874 transfer : `str`, optional 

875 How (and whether) the dataset should be added to the datastore. 

876 See `ingest` for details of transfer modes. 

877 record_validation_info : `bool`, optional 

878 If `True`, the default, the datastore can record validation 

879 information associated with the file. If `False` the datastore 

880 will not attempt to track any information such as checksums 

881 or file sizes. This can be useful if such information is tracked 

882 in an external system or if the file is to be compressed in place. 

883 It is up to the datastore whether this parameter is relevant. 

884 

885 Returns 

886 ------- 

887 info : `StoredFileInfo` 

888 Internal datastore record for this file. This will be inserted by 

889 the caller; the `_extractIngestInfo` is only responsible for 

890 creating and populating the struct. 

891 

892 Raises 

893 ------ 

894 FileNotFoundError 

895 Raised if one of the given files does not exist. 

896 FileExistsError 

897 Raised if transfer is not `None` but the (internal) location the 

898 file would be moved to is already occupied. 

899 """ 

900 if self._transaction is None:  (900 ↛ 901: line 900 didn't jump to line 901, because the condition on line 900 was never true)

901 raise RuntimeError("Ingest called without transaction enabled") 

902 

903 # Create URI of the source path, do not need to force a relative 

904 # path to absolute. 

905 srcUri = ResourcePath(path, forceAbsolute=False) 

906 

907 # Track whether we have read the size of the source yet 

908 have_sized = False 

909 

910 tgtLocation: Optional[Location] 

911 if transfer is None or transfer == "split": 

912 # A relative path is assumed to be relative to the datastore 

913 # in this context 

914 if not srcUri.isabs(): 

915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

916 else: 

917 # Work out the path in the datastore from an absolute URI 

918 # This is required to be within the datastore. 

919 pathInStore = srcUri.relative_to(self.root) 

920 if pathInStore is None and transfer is None:  (920 ↛ 921: line 920 didn't jump to line 921, because the condition on line 920 was never true)

921 raise RuntimeError( 

922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

923 ) 

924 if pathInStore:  (924 ↛ 926: line 924 didn't jump to line 926, because the condition on line 924 was never false)

925 tgtLocation = self.locationFactory.fromPath(pathInStore) 

926 elif transfer == "split": 

927 # Outside the datastore but treat that as a direct ingest 

928 # instead. 

929 tgtLocation = None 

930 else: 

931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

932 elif transfer == "direct":  (932 ↛ 937: line 932 didn't jump to line 937, because the condition on line 932 was never true)

933 # Want to store the full URI to the resource directly in 

934 # datastore. This is useful for referring to permanent archive 

935 # storage for raw data. 

936 # Trust that people know what they are doing. 

937 tgtLocation = None 

938 else: 

939 # Work out the name we want this ingested file to have 

940 # inside the datastore 

941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

942 if not tgtLocation.uri.dirname().exists(): 

943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

944 tgtLocation.uri.dirname().mkdir() 

945 

946 # if we are transferring from a local file to a remote location 

947 # it may be more efficient to get the size and checksum of the 

948 # local file rather than the transferred one 

949 if record_validation_info and srcUri.isLocal: 

950 size = srcUri.size() 

951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

952 have_sized = True 

953 

954 # Transfer the resource to the destination. 

955 # Allow overwrite of an existing file. This matches the behavior 

956 # of datastore.put() in that it trusts that registry would not 

957 # be asking to overwrite unless registry thought that the 

958 # overwrite was allowed. 

959 tgtLocation.uri.transfer_from( 

960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

961 ) 

962 

963 if tgtLocation is None:  (963 ↛ 965: line 963 didn't jump to line 965, because the condition on line 963 was never true)

964 # This means we are using direct mode 

965 targetUri = srcUri 

966 targetPath = str(srcUri) 

967 else: 

968 targetUri = tgtLocation.uri 

969 targetPath = tgtLocation.pathInStore.path 

970 

971 # the file should exist in the datastore now 

972 if record_validation_info: 

973 if not have_sized: 

974 size = targetUri.size() 

975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

976 else: 

977 # Not recording any file information. 

978 size = -1 

979 checksum = None 

980 

981 return StoredFileInfo( 

982 formatter=formatter, 

983 path=targetPath, 

984 storageClass=ref.datasetType.storageClass, 

985 component=ref.datasetType.component(), 

986 file_size=size, 

987 checksum=checksum, 

988 ) 

989 
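
Editor's note: the optional checksumming referenced above goes through ``self.computeChecksum``, whose definition lies outside this excerpt; the ``hashlib`` import at the top of the module supports it. The following is only a generic illustration of a chunked file digest, not the actual implementation.

import hashlib

def file_digest(path: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    """Illustrative chunked digest of a local file (hypothetical helper)."""
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()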

990 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

991 # Docstring inherited from Datastore._prepIngest. 

992 filtered = [] 

993 for dataset in datasets: 

994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

995 if not acceptable: 

996 continue 

997 else: 

998 dataset.refs = acceptable 

999 if dataset.formatter is None: 

1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1001 else: 

1002 assert isinstance(dataset.formatter, (type, str)) 

1003 formatter_class = get_class_of(dataset.formatter) 

1004 if not issubclass(formatter_class, Formatter):  (1004 ↛ 1005: line 1004 didn't jump to line 1005, because the condition on line 1004 was never true)

1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1006 dataset.formatter = formatter_class 

1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1008 filtered.append(dataset) 

1009 return _IngestPrepData(filtered) 

1010 

1011 @transactional 

1012 def _finishIngest( 

1013 self, 

1014 prepData: Datastore.IngestPrepData, 

1015 *, 

1016 transfer: Optional[str] = None, 

1017 record_validation_info: bool = True, 

1018 ) -> None: 

1019 # Docstring inherited from Datastore._finishIngest. 

1020 refsAndInfos = [] 

1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1023 # Do ingest as if the first dataset ref is associated with the file 

1024 info = self._extractIngestInfo( 

1025 dataset.path, 

1026 dataset.refs[0], 

1027 formatter=dataset.formatter, 

1028 transfer=transfer, 

1029 record_validation_info=record_validation_info, 

1030 ) 

1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1032 self._register_datasets(refsAndInfos) 

1033 

1034 def _calculate_ingested_datastore_name( 

1035 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1036 ) -> Location: 

1037 """Given a source URI and a DatasetRef, determine the name the 

1038 dataset will have inside datastore. 

1039 

1040 Parameters 

1041 ---------- 

1042 srcUri : `lsst.resources.ResourcePath` 

1043 URI to the source dataset file. 

1044 ref : `DatasetRef` 

1045 Ref associated with the newly-ingested dataset artifact. This 

1046 is used to determine the name within the datastore. 

1047 formatter : `Formatter` or `Formatter` class. 

1048 Formatter to use for validation. Can be a class or an instance. 

1049 

1050 Returns 

1051 ------- 

1052 location : `Location` 

1053 Target location for the newly-ingested dataset. 

1054 """ 

1055 # Ingesting a file from outside the datastore. 

1056 # This involves a new name. 

1057 template = self.templates.getTemplate(ref) 

1058 location = self.locationFactory.fromPath(template.format(ref)) 

1059 

1060 # Get the extension 

1061 ext = srcUri.getExtension() 

1062 

1063 # Update the destination to include that extension 

1064 location.updateExtension(ext) 

1065 

1066 # Ask the formatter to validate this extension 

1067 formatter.validateExtension(location) 

1068 

1069 return location 

1070 

1071 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1072 """Write out in memory dataset to datastore. 

1073 

1074 Parameters 

1075 ---------- 

1076 inMemoryDataset : `object` 

1077 Dataset to write to datastore. 

1078 ref : `DatasetRef` 

1079 Registry information associated with this dataset. 

1080 

1081 Returns 

1082 ------- 

1083 info : `StoredFileInfo` 

1084 Information describing the artifact written to the datastore. 

1085 """ 

1086 # May need to coerce the in memory dataset to the correct 

1087 # python type. 

1088 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1089 

1090 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1091 uri = location.uri 

1092 

1093 if not uri.dirname().exists(): 

1094 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1095 uri.dirname().mkdir() 

1096 

1097 if self._transaction is None:  (1097 ↛ 1098: line 1097 didn't jump to line 1098, because the condition on line 1097 was never true)

1098 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1099 

1100 def _removeFileExists(uri: ResourcePath) -> None: 

1101 """Remove a file and do not complain if it is not there. 

1102 

1103 This is important since a formatter might fail before the file 

1104 is written and we should not confuse people by writing spurious 

1105 error messages to the log. 

1106 """ 

1107 try: 

1108 uri.remove() 

1109 except FileNotFoundError: 

1110 pass 

1111 

1112 # Register a callback to try to delete the uploaded data if 

1113 # something fails below 

1114 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1115 

1116 # For a local file, simply use the formatter directly 

1117 if uri.isLocal: 

1118 try: 

1119 formatter.write(inMemoryDataset) 

1120 except Exception as e: 

1121 raise RuntimeError( 

1122 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1123 ) from e 

1124 log.debug("Successfully wrote python object to local file at %s", uri) 

1125 else: 

1126 # This is a remote URI. Some datasets can be serialized directly 

1127 # to bytes and sent to the remote datastore without writing a 

1128 # file. If the dataset is intended to be saved to the cache 

1129 # a file is always written and direct write to the remote 

1130 # datastore is bypassed. 

1131 data_written = False 

1132 if not self.cacheManager.should_be_cached(ref): 

1133 try: 

1134 serializedDataset = formatter.toBytes(inMemoryDataset) 

1135 except NotImplementedError: 

1136 # Fallback to the file writing option. 

1137 pass 

1138 except Exception as e: 

1139 raise RuntimeError( 

1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1141 ) from e 

1142 else: 

1143 log.debug("Writing bytes directly to %s", uri) 

1144 uri.write(serializedDataset, overwrite=True) 

1145 log.debug("Successfully wrote bytes directly to %s", uri) 

1146 data_written = True 

1147 

1148 if not data_written: 

1149 # Did not write the bytes directly to object store so instead 

1150 # write to temporary file. 

1151 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1152 # Need to configure the formatter to write to a different 

1153 # location and that needs us to overwrite internals 

1154 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1155 with formatter._updateLocation(Location(None, temporary_uri)): 

1156 try: 

1157 formatter.write(inMemoryDataset) 

1158 except Exception as e: 

1159 raise RuntimeError( 

1160 f"Failed to serialize dataset {ref} of type" 

1161 f" {type(inMemoryDataset)} to " 

1162 f"temporary location {temporary_uri}" 

1163 ) from e 

1164 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1165 

1166 # Cache if required 

1167 self.cacheManager.move_to_cache(temporary_uri, ref) 

1168 

1169 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1170 

1171 # URI is needed to resolve which ingest case we are dealing with. 

1172 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1173 
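
Editor's note: the remote branch above follows a try/except/else pattern: attempt a direct bytes upload, and fall back to a temporary local file (which also feeds the cache) when the formatter cannot serialize to bytes; in the code above the bytes path is also skipped when the dataset is destined for the cache. A stripped-down sketch of that control flow with hypothetical stand-ins:

def choose_write_strategy(formatter_supports_bytes: bool) -> str:
    """Return which path a remote write would take; stand-in for the logic above."""
    try:
        if not formatter_supports_bytes:
            raise NotImplementedError("formatter cannot serialize to bytes")
        _payload = b"serialized dataset"   # formatter.toBytes(...) stand-in
    except NotImplementedError:
        return "temporary local file, then transfer (and optional cache)"
    else:
        return "direct bytes upload"

assert choose_write_strategy(True) == "direct bytes upload"
assert choose_write_strategy(False).startswith("temporary local file")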

1174 def _read_artifact_into_memory( 

1175 self, 

1176 getInfo: DatastoreFileGetInformation, 

1177 ref: DatasetRef, 

1178 isComponent: bool = False, 

1179 cache_ref: Optional[DatasetRef] = None, 

1180 ) -> Any: 

1181 """Read the artifact from datastore into in memory object. 

1182 

1183 Parameters 

1184 ---------- 

1185 getInfo : `DatastoreFileGetInformation` 

1186 Information about the artifact within the datastore. 

1187 ref : `DatasetRef` 

1188 The registry information associated with this artifact. 

1189 isComponent : `bool` 

1190 Flag to indicate if a component is being read from this artifact. 

1191 cache_ref : `DatasetRef`, optional 

1192 The DatasetRef to use when looking up the file in the cache. 

1193 This ref must have the same ID as the supplied ref but can 

1194 be a parent ref or component ref to indicate to the cache whether 

1195 a composite file is being requested from the cache or a component 

1196 file. Without this the cache will default to the supplied ref but 

1197 it can get confused with read-only derived components for 

1198 disassembled composites. 

1199 

1200 Returns 

1201 ------- 

1202 inMemoryDataset : `object` 

1203 The artifact as a python object. 

1204 """ 

1205 location = getInfo.location 

1206 uri = location.uri 

1207 log.debug("Accessing data from %s", uri) 

1208 

1209 if cache_ref is None: 

1210 cache_ref = ref 

1211 if cache_ref.id != ref.id:  (1211 ↛ 1212: line 1211 didn't jump to line 1212, because the condition on line 1211 was never true)

1212 raise ValueError( 

1213 "The supplied cache dataset ref refers to a different dataset than expected:" 

1214 f" {ref.id} != {cache_ref.id}" 

1215 ) 

1216 

1217 # Cannot recalculate checksum but can compare size as a quick check 

1218 # Do not do this if the size is negative since that indicates 

1219 # we do not know. 

1220 recorded_size = getInfo.info.file_size 

1221 resource_size = uri.size() 

1222 if recorded_size >= 0 and resource_size != recorded_size:  (1222 ↛ 1223: line 1222 didn't jump to line 1223, because the condition on line 1222 was never true)

1223 raise RuntimeError( 

1224 "Integrity failure in Datastore. " 

1225 f"Size of file {uri} ({resource_size}) " 

1226 f"does not match size recorded in registry of {recorded_size}" 

1227 ) 

1228 

1229 # For the general case we have choices for how to proceed. 

1230 # 1. Always use a local file (downloading the remote resource to a 

1231 # temporary file if needed). 

1232 # 2. Use a threshold size and read into memory and use bytes. 

1233 # Use both for now with an arbitrary hand off size. 

1234 # This allows small datasets to be downloaded from remote object 

1235 # stores without requiring a temporary file. 

1236 

1237 formatter = getInfo.formatter 

1238 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1239 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1240 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1241 if cached_file is not None: 

1242 desired_uri = cached_file 

1243 msg = f" (cached version of {uri})" 

1244 else: 

1245 desired_uri = uri 

1246 msg = "" 

1247 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1248 serializedDataset = desired_uri.read() 

1249 log.debug( 

1250 "Deserializing %s from %d bytes from location %s with formatter %s", 

1251 f"component {getInfo.component}" if isComponent else "", 

1252 len(serializedDataset), 

1253 uri, 

1254 formatter.name(), 

1255 ) 

1256 try: 

1257 result = formatter.fromBytes( 

1258 serializedDataset, component=getInfo.component if isComponent else None 

1259 ) 

1260 except Exception as e: 

1261 raise ValueError( 

1262 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1263 f" ({ref.datasetType.name} from {uri}): {e}" 

1264 ) from e 

1265 else: 

1266 # Read from file. 

1267 

1268 # Have to update the Location associated with the formatter 

1269 # because formatter.read does not allow an override. 

1270 # This could be improved. 

1271 location_updated = False 

1272 msg = "" 

1273 

1274 # First check in cache for local version. 

1275 # The cache will only be relevant for remote resources but 

1276 # no harm in always asking. Context manager ensures that cache 

1277 # file is not deleted during cache expiration. 

1278 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1279 if cached_file is not None: 

1280 msg = f"(via cache read of remote file {uri})" 

1281 uri = cached_file 

1282 location_updated = True 

1283 

1284 with uri.as_local() as local_uri: 

1285 

1286 can_be_cached = False 

1287 if uri != local_uri:  (1287 ↛ 1289: line 1287 didn't jump to line 1289, because the condition on line 1287 was never true)

1288 # URI was remote and file was downloaded 

1289 cache_msg = "" 

1290 location_updated = True 

1291 

1292 if self.cacheManager.should_be_cached(cache_ref): 

1293 # In this scenario we want to ask if the downloaded 

1294 # file should be cached but we should not cache 

1295 # it until after we've used it (to ensure it can't 

1296 # be expired whilst we are using it). 

1297 can_be_cached = True 

1298 

1299 # Say that it is "likely" to be cached because 

1300 # if the formatter read fails we will not be 

1301 # caching this file. 

1302 cache_msg = " and likely cached" 

1303 

1304 msg = f"(via download to local file{cache_msg})" 

1305 

1306 # Calculate the (possibly) new location for the formatter 

1307 # to use. 

1308 newLocation = Location(*local_uri.split()) if location_updated else None 

1309 

1310 log.debug( 

1311 "Reading%s from location %s %s with formatter %s", 

1312 f" component {getInfo.component}" if isComponent else "", 

1313 uri, 

1314 msg, 

1315 formatter.name(), 

1316 ) 

1317 try: 

1318 with formatter._updateLocation(newLocation): 

1319 with time_this( 

1320 log, 

1321 msg="Reading%s from location %s %s with formatter %s", 

1322 args=( 

1323 f" component {getInfo.component}" if isComponent else "", 

1324 uri, 

1325 msg, 

1326 formatter.name(), 

1327 ), 

1328 ): 

1329 result = formatter.read(component=getInfo.component if isComponent else None) 

1330 except Exception as e: 

1331 raise ValueError( 

1332 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1333 f" ({ref.datasetType.name} from {uri}): {e}" 

1334 ) from e 

1335 

1336 # File was read successfully so can move to cache 

1337 if can_be_cached:  (1337 ↛ 1338: line 1337 didn't jump to line 1338, because the condition on line 1337 was never true)

1338 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1339 

1340 return self._post_process_get( 

1341 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1342 ) 

1343 
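
Editor's note: the read path above picks between two strategies using an arbitrary 10 MB hand-off: small artifacts whose formatter can deserialize from bytes are read directly, everything else goes through a local file (possibly served from, or added to, the cache). A one-line sketch of that predicate:

NBYTES_MAX = 10_000_000  # same arbitrary hand-off size used above

def read_directly_as_bytes(resource_size: int, formatter_can_read_bytes: bool) -> bool:
    return resource_size <= NBYTES_MAX and formatter_can_read_bytes

assert read_directly_as_bytes(4_096, True) is True
assert read_directly_as_bytes(50_000_000, True) is False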

1344 def knows(self, ref: DatasetRef) -> bool: 

1345 """Check if the dataset is known to the datastore. 

1346 

1347 Does not check for existence of any artifact. 

1348 

1349 Parameters 

1350 ---------- 

1351 ref : `DatasetRef` 

1352 Reference to the required dataset. 

1353 

1354 Returns 

1355 ------- 

1356 exists : `bool` 

1357 `True` if the dataset is known to the datastore. 

1358 """ 

1359 fileLocations = self._get_dataset_locations_info(ref) 

1360 if fileLocations: 

1361 return True 

1362 return False 

1363 

1364 def _process_mexists_records( 

1365 self, 

1366 id_to_ref: Dict[DatasetId, DatasetRef], 

1367 records: Dict[DatasetId, List[StoredFileInfo]], 

1368 all_required: bool, 

1369 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1370 ) -> Dict[DatasetRef, bool]: 

1371 """Helper function for mexists that checks the given records. 

1372 

1373 Parameters 

1374 ---------- 

1375 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1376 Mapping of the dataset ID to the dataset ref itself. 

1377 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1378 Records as generally returned by 

1379 ``_get_stored_records_associated_with_refs``. 

1380 all_required : `bool` 

1381 If `True`, all artifacts associated with a dataset ID must exist 

1382 for the dataset to be considered present; if `False`, any one artifact is sufficient. 

1383 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1384 Optional mapping of datastore artifact to existence. Updated by 

1385 this method with details of all artifacts tested. Can be `None` 

1386 if the caller is not interested. 

1387 

1388 Returns 

1389 ------- 

1390 existence : `dict` of [`DatasetRef`, `bool`] 

1391 Mapping from dataset to boolean indicating existence. 

1392 """ 

1393 # The URIs to be checked and a mapping of those URIs to 

1394 # the dataset ID. 

1395 uris_to_check: List[ResourcePath] = [] 

1396 location_map: Dict[ResourcePath, DatasetId] = {} 

1397 

1398 location_factory = self.locationFactory 

1399 

1400 uri_existence: Dict[ResourcePath, bool] = {} 

1401 for ref_id, infos in records.items(): 

1402 # Key is the dataset Id, value is list of StoredItemInfo 

1403 uris = [info.file_location(location_factory).uri for info in infos] 

1404 location_map.update({uri: ref_id for uri in uris}) 

1405 

1406 # Check the local cache directly for a dataset corresponding 

1407 # to the remote URI. 

1408 if self.cacheManager.file_count > 0: 

1409 ref = id_to_ref[ref_id] 

1410 for uri, storedFileInfo in zip(uris, infos): 

1411 check_ref = ref 

1412 if not ref.datasetType.isComponent() and (component := storedFileInfo.component):  (1412 ↛ 1413: line 1412 didn't jump to line 1413, because the condition on line 1412 was never true)

1413 check_ref = ref.makeComponentRef(component) 

1414 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1415 # Proxy for URI existence. 

1416 uri_existence[uri] = True 

1417 else: 

1418 uris_to_check.append(uri) 

1419 else: 

1420 # Check all of them. 

1421 uris_to_check.extend(uris) 

1422 

1423 if artifact_existence is not None: 

1424 # If a URI has already been checked remove it from the list 

1425 # and immediately add the status to the output dict. 

1426 filtered_uris_to_check = [] 

1427 for uri in uris_to_check: 

1428 if uri in artifact_existence: 

1429 uri_existence[uri] = artifact_existence[uri] 

1430 else: 

1431 filtered_uris_to_check.append(uri) 

1432 uris_to_check = filtered_uris_to_check 

1433 

1434 # Results. 

1435 dataset_existence: Dict[DatasetRef, bool] = {} 

1436 

1437 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1438 for uri, exists in uri_existence.items(): 

1439 dataset_id = location_map[uri] 

1440 ref = id_to_ref[dataset_id] 

1441 

1442 # Disassembled composite needs to check all locations. 

1443 # all_required indicates whether all need to exist or not. 

1444 if ref in dataset_existence: 

1445 if all_required: 

1446 exists = dataset_existence[ref] and exists 

1447 else: 

1448 exists = dataset_existence[ref] or exists 

1449 dataset_existence[ref] = exists 

1450 

1451 if artifact_existence is not None: 

1452 artifact_existence.update(uri_existence) 

1453 

1454 return dataset_existence 

1455 
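
Editor's note: to spell out the folding above, per-artifact existence is combined into per-dataset existence, requiring every component file when ``all_required`` is true and accepting any one otherwise. A self-contained sketch with invented datasets:

def fold_existence(per_file, all_required: bool) -> dict:
    """per_file maps (dataset, uri) -> bool; returns dataset -> bool."""
    result: dict = {}
    for (dataset, _uri), exists in per_file.items():
        if dataset in result:
            exists = (result[dataset] and exists) if all_required else (result[dataset] or exists)
        result[dataset] = exists
    return result

per_file = {("ds1", "a.fits"): True, ("ds1", "b.fits"): False, ("ds2", "c.fits"): True}
assert fold_existence(per_file, all_required=True) == {"ds1": False, "ds2": True}
assert fold_existence(per_file, all_required=False) == {"ds1": True, "ds2": True}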

1456 def mexists( 

1457 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1458 ) -> Dict[DatasetRef, bool]: 

1459 """Check the existence of multiple datasets at once. 

1460 

1461 Parameters 

1462 ---------- 

1463 refs : iterable of `DatasetRef` 

1464 The datasets to be checked. 

1465 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1466 Optional mapping of datastore artifact to existence. Updated by 

1467 this method with details of all artifacts tested. Can be `None` 

1468 if the caller is not interested. 

1469 

1470 Returns 

1471 ------- 

1472 existence : `dict` of [`DatasetRef`, `bool`] 

1473 Mapping from dataset to boolean indicating existence. 

1474 

1475 Notes 

1476 ----- 

1477 To minimize potentially costly remote existence checks, the local 

1478 cache is checked as a proxy for existence. If a file for this 

1479 `DatasetRef` does exist no check is done for the actual URI. This 

1480 could result in possibly unexpected behavior if the dataset itself 

1481 has been removed from the datastore by another process whilst it is 

1482 still in the cache. 

1483 """ 

1484 chunk_size = 10_000 

1485 dataset_existence: Dict[DatasetRef, bool] = {} 

1486 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1487 n_found_total = 0 

1488 n_checked = 0 

1489 n_chunks = 0 

1490 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1491 chunk_result = self._mexists(chunk, artifact_existence) 

1492 if log.isEnabledFor(VERBOSE): 

1493 n_results = len(chunk_result) 

1494 n_checked += n_results 

1495 # Can treat the booleans as 0, 1 integers and sum them. 

1496 n_found = sum(chunk_result.values()) 

1497 n_found_total += n_found 

1498 log.verbose( 

1499 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1500 n_chunks, 

1501 n_found, 

1502 n_results, 

1503 n_found_total, 

1504 n_checked, 

1505 ) 

1506 dataset_existence.update(chunk_result) 

1507 n_chunks += 1 

1508 

1509 return dataset_existence 

1510 
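A minimal usage sketch (not part of the listed source), assuming `datastore` is an already-configured `FileDatastore` and `refs` is a list of resolved `DatasetRef` objects obtained elsewhere; both names are hypothetical:

from typing import Dict

from lsst.resources import ResourcePath

# Shared cache of per-URI existence results, filled in by mexists().
artifact_existence: Dict[ResourcePath, bool] = {}
existence = datastore.mexists(refs, artifact_existence=artifact_existence)
missing = [ref for ref, found in existence.items() if not found]
# artifact_existence now records every URI that was actually checked and can
# be passed to later calls (for example transfer_from) to avoid re-checking.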

1511 def _mexists( 

1512 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1513 ) -> Dict[DatasetRef, bool]: 

1514 """Check the existence of multiple datasets at once. 

1515 

1516 Parameters 

1517 ---------- 

1518 refs : iterable of `DatasetRef` 

1519 The datasets to be checked. 

1520 

1521 Returns 

1522 ------- 

1523 existence : `dict` of [`DatasetRef`, `bool`] 

1524 Mapping from dataset to boolean indicating existence. 

1525 """ 

1526 # Need a mapping of dataset_id to dataset ref since the API 

1527 # works with dataset_id 

1528 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1529 

1530 # Set of all IDs we are checking for. 

1531 requested_ids = set(id_to_ref.keys()) 

1532 

1533 # The records themselves. Could be missing some entries. 

1534 records = self._get_stored_records_associated_with_refs(refs) 

1535 

1536 dataset_existence = self._process_mexists_records( 

1537 id_to_ref, records, True, artifact_existence=artifact_existence 

1538 ) 

1539 

1540 # Set of IDs that have been handled. 

1541 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1542 

1543 missing_ids = requested_ids - handled_ids 

1544 if missing_ids: 

1545 if not self.trustGetRequest: 

1546 # Must assume these do not exist 

1547 for missing in missing_ids: 

1548 dataset_existence[id_to_ref[missing]] = False 

1549 else: 

1550 log.debug( 

1551 "%d out of %d datasets were not known to datastore during initial existence check.", 

1552 len(missing_ids), 

1553 len(requested_ids), 

1554 ) 

1555 

1556 # Construct data structure identical to that returned 

1557 # by _get_stored_records_associated_with_refs() but using 

1558 # guessed names. 

1559 records = {} 

1560 for missing in missing_ids: 

1561 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1562 records[missing] = [info for _, info in expected] 

1563 

1564 dataset_existence.update( 

1565 self._process_mexists_records( 

1566 id_to_ref, records, False, artifact_existence=artifact_existence 

1567 ) 

1568 ) 

1569 

1570 return dataset_existence 

1571 

1572 def exists(self, ref: DatasetRef) -> bool: 

1573 """Check if the dataset exists in the datastore. 

1574 

1575 Parameters 

1576 ---------- 

1577 ref : `DatasetRef` 

1578 Reference to the required dataset. 

1579 

1580 Returns 

1581 ------- 

1582 exists : `bool` 

1583 `True` if the entity exists in the `Datastore`. 

1584 

1585 Notes 

1586 ----- 

1587 The local cache is checked as a proxy for existence in the remote 

1588 object store. It is possible that another process on a different 

1589 compute node could remove the file from the object store even 

1590 though it is present in the local cache. 

1591 """ 

1592 fileLocations = self._get_dataset_locations_info(ref) 

1593 

1594 # if we are being asked to trust that registry might not be correct 

1595 # we ask for the expected locations and check them explicitly 

1596 if not fileLocations: 

1597 if not self.trustGetRequest: 

1598 return False 

1599 

1600 # First check the cache. If it is not found we must check 

1601 # the datastore itself. Assume that any component in the cache 

1602 # means that the dataset does exist somewhere. 

1603 if self.cacheManager.known_to_cache(ref): 1603 ↛ 1604line 1603 didn't jump to line 1604, because the condition on line 1603 was never true

1604 return True 

1605 

1606 # When we are guessing a dataset location we can not check 

1607 # for the existence of every component since we can not 

1608 # know if every component was written. Instead we check 

1609 # for the existence of any of the expected locations. 

1610 for location, _ in self._get_expected_dataset_locations_info(ref): 1610 ↛ 1613line 1610 didn't jump to line 1613, because the loop on line 1610 didn't complete

1611 if self._artifact_exists(location): 1611 ↛ 1610line 1611 didn't jump to line 1610, because the condition on line 1611 was never false

1612 return True 

1613 return False 

1614 

1615 # All listed artifacts must exist. 

1616 for location, storedFileInfo in fileLocations: 

1617 # Checking in cache needs the component ref. 

1618 check_ref = ref 

1619 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1620 check_ref = ref.makeComponentRef(component) 

1621 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1622 continue 

1623 

1624 if not self._artifact_exists(location): 

1625 return False 

1626 

1627 return True 

1628 
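A brief sketch of a single-dataset check with the same hypothetical `datastore` and a resolved `ref`; for many datasets the bulk `mexists` shown earlier is preferable:

if not datastore.exists(ref):
    raise FileNotFoundError(f"Dataset {ref} is not present in datastore {datastore.name}")
# The artifact (or its cached copy) is there, so a read should succeed.
obj = datastore.get(ref)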

1629 def getURIs( 

1630 self, ref: DatasetRef, predict: bool = False 

1631 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1632 """Return URIs associated with dataset. 

1633 

1634 Parameters 

1635 ---------- 

1636 ref : `DatasetRef` 

1637 Reference to the required dataset. 

1638 predict : `bool`, optional 

1639 If the datastore does not know about the dataset, should it 

1640 return a predicted URI or not? 

1641 

1642 Returns 

1643 ------- 

1644 primary : `lsst.resources.ResourcePath` 

1645 The URI to the primary artifact associated with this dataset. 

1646 If the dataset was disassembled within the datastore this 

1647 may be `None`. 

1648 components : `dict` 

1649 URIs to any components associated with the dataset artifact. 

1650 Can be empty if there are no components. 

1651 """ 

1652 

1653 primary: Optional[ResourcePath] = None 

1654 components: Dict[str, ResourcePath] = {} 

1655 

1656 # if this has never been written then we have to guess 

1657 if not self.exists(ref): 

1658 if not predict: 

1659 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1660 

1661 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1662 

1663 if doDisassembly: 

1664 

1665 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1666 compRef = ref.makeComponentRef(component) 

1667 compLocation, _ = self._determine_put_formatter_location(compRef) 

1668 

1669 # Add a URI fragment to indicate this is a guess 

1670 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted") 

1671 

1672 else: 

1673 

1674 location, _ = self._determine_put_formatter_location(ref) 

1675 

1676 # Add a URI fragment to indicate this is a guess 

1677 primary = ResourcePath(location.uri.geturl() + "#predicted") 

1678 

1679 return primary, components 

1680 

1681 # If this is a ref that we have written we can get the path. 

1682 # Get file metadata and internal metadata 

1683 fileLocations = self._get_dataset_locations_info(ref) 

1684 

1685 guessing = False 

1686 if not fileLocations: 

1687 if not self.trustGetRequest: 1687 ↛ 1688line 1687 didn't jump to line 1688, because the condition on line 1687 was never true

1688 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1689 fileLocations = self._get_expected_dataset_locations_info(ref) 

1690 guessing = True 

1691 

1692 if len(fileLocations) == 1: 

1693 # No disassembly so this is the primary URI 

1694 uri = fileLocations[0][0].uri 

1695 if guessing and not uri.exists(): 1695 ↛ 1696line 1695 didn't jump to line 1696, because the condition on line 1695 was never true

1696 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1697 primary = uri 

1698 

1699 else: 

1700 for location, storedFileInfo in fileLocations: 

1701 if storedFileInfo.component is None: 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true

1702 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1703 uri = location.uri 

1704 if guessing and not uri.exists(): 1704 ↛ 1708line 1704 didn't jump to line 1708, because the condition on line 1704 was never true

1705 # If we are trusting then it is entirely possible for 

1706 # some components to be missing. In that case we skip 

1707 # to the next component. 

1708 if self.trustGetRequest: 

1709 continue 

1710 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1711 components[storedFileInfo.component] = uri 

1712 

1713 return primary, components 

1714 
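A sketch of inspecting the artifact URIs of a possibly disassembled dataset, reusing the hypothetical `datastore` and `ref`; with predict=True any unwritten artifact comes back with a "#predicted" fragment as described above:

primary, components = datastore.getURIs(ref, predict=True)
if primary is not None:
    # Single-file dataset: one primary artifact, no components.
    print("primary artifact:", primary)
for component, uri in components.items():
    # Disassembled dataset: one artifact per component.
    print(f"component {component}: {uri}")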

1715 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1716 """URI to the Dataset. 

1717 

1718 Parameters 

1719 ---------- 

1720 ref : `DatasetRef` 

1721 Reference to the required Dataset. 

1722 predict : `bool` 

1723 If `True`, allow URIs to be returned of datasets that have not 

1724 been written. 

1725 

1726 Returns 

1727 ------- 

1728 uri : `lsst.resources.ResourcePath` 

1729 URI pointing to the dataset within the datastore. If the 

1730 dataset does not exist in the datastore, and if ``predict`` is 

1731 `True`, the URI will be a prediction and will include a URI 

1732 fragment "#predicted". 

1733 If the datastore does not have entities that relate well 

1734 to the concept of a URI the returned URI will be 

1735 descriptive. The returned URI is not guaranteed to be obtainable. 

1736 

1737 Raises 

1738 ------ 

1739 FileNotFoundError 

1740 Raised if a URI has been requested for a dataset that does not 

1741 exist and guessing is not allowed. 

1742 RuntimeError 

1743 Raised if a request is made for a single URI but multiple URIs 

1744 are associated with this dataset. 

1745 

1746 Notes 

1747 ----- 

1748 When a predicted URI is requested an attempt will be made to form 

1749 a reasonable URI based on file templates and the expected formatter. 

1750 """ 

1751 primary, components = self.getURIs(ref, predict) 

1752 if primary is None or components: 1752 ↛ 1753line 1752 didn't jump to line 1753, because the condition on line 1752 was never true

1753 raise RuntimeError( 

1754 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1755 ) 

1756 return primary 

1757 

1758 def retrieveArtifacts( 

1759 self, 

1760 refs: Iterable[DatasetRef], 

1761 destination: ResourcePath, 

1762 transfer: str = "auto", 

1763 preserve_path: bool = True, 

1764 overwrite: bool = False, 

1765 ) -> List[ResourcePath]: 

1766 """Retrieve the file artifacts associated with the supplied refs. 

1767 

1768 Parameters 

1769 ---------- 

1770 refs : iterable of `DatasetRef` 

1771 The datasets for which file artifacts are to be retrieved. 

1772 A single ref can result in multiple files. The refs must 

1773 be resolved. 

1774 destination : `lsst.resources.ResourcePath` 

1775 Location to write the file artifacts. 

1776 transfer : `str`, optional 

1777 Method to use to transfer the artifacts. Must be one of the options 

1778 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1779 "move" is not allowed. 

1780 preserve_path : `bool`, optional 

1781 If `True` the full path of the file artifact within the datastore 

1782 is preserved. If `False` the final file component of the path 

1783 is used. 

1784 overwrite : `bool`, optional 

1785 If `True` allow transfers to overwrite existing files at the 

1786 destination. 

1787 

1788 Returns 

1789 ------- 

1790 targets : `list` of `lsst.resources.ResourcePath` 

1791 URIs of file artifacts in destination location. Order is not 

1792 preserved. 

1793 """ 

1794 if not destination.isdir(): 1794 ↛ 1795line 1794 didn't jump to line 1795, because the condition on line 1794 was never true

1795 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1796 

1797 if transfer == "move": 

1798 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1799 

1800 # Source -> Destination 

1801 # This also helps filter out duplicate DatasetRef in the request 

1802 # that will map to the same underlying file transfer. 

1803 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1804 

1805 for ref in refs: 

1806 locations = self._get_dataset_locations_info(ref) 

1807 for location, _ in locations: 

1808 source_uri = location.uri 

1809 target_path: ResourcePathExpression 

1810 if preserve_path: 

1811 target_path = location.pathInStore 

1812 if target_path.isabs(): 1812 ↛ 1815line 1812 didn't jump to line 1815, because the condition on line 1812 was never true

1813 # This is an absolute path to an external file. 

1814 # Use the full path. 

1815 target_path = target_path.relativeToPathRoot 

1816 else: 

1817 target_path = source_uri.basename() 

1818 target_uri = destination.join(target_path) 

1819 to_transfer[source_uri] = target_uri 

1820 

1821 # In theory can now parallelize the transfer 

1822 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1823 for source_uri, target_uri in to_transfer.items(): 

1824 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1825 

1826 return list(to_transfer.values()) 

1827 
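A sketch of copying the file artifacts for several datasets out of the datastore; the destination directory is a hypothetical local path and is assumed to exist already:

from lsst.resources import ResourcePath

# Destination must refer to a directory.
destination = ResourcePath("/tmp/butler-artifacts/", forceDirectory=True)
targets = datastore.retrieveArtifacts(
    refs,
    destination,
    transfer="copy",
    preserve_path=True,
    overwrite=False,
)
print(f"Copied {len(targets)} file artifacts to {destination}")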

1828 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1829 """Load an InMemoryDataset from the store. 

1830 

1831 Parameters 

1832 ---------- 

1833 ref : `DatasetRef` 

1834 Reference to the required Dataset. 

1835 parameters : `dict` 

1836 `StorageClass`-specific parameters that specify, for example, 

1837 a slice of the dataset to be loaded. 

1838 

1839 Returns 

1840 ------- 

1841 inMemoryDataset : `object` 

1842 Requested dataset or slice thereof as an InMemoryDataset. 

1843 

1844 Raises 

1845 ------ 

1846 FileNotFoundError 

1847 Requested dataset can not be retrieved. 

1848 TypeError 

1849 Return value from formatter has unexpected type. 

1850 ValueError 

1851 Formatter failed to process the dataset. 

1852 """ 

1853 allGetInfo = self._prepare_for_get(ref, parameters) 

1854 refComponent = ref.datasetType.component() 

1855 

1856 # Supplied storage class for the component being read 

1857 refStorageClass = ref.datasetType.storageClass 

1858 

1859 # Create mapping from component name to related info 

1860 allComponents = {i.component: i for i in allGetInfo} 

1861 

1862 # By definition the dataset is disassembled if we have more 

1863 # than one record for it. 

1864 isDisassembled = len(allGetInfo) > 1 

1865 

1866 # Look for the special case where we are disassembled but the 

1867 # component is a derived component that was not written during 

1868 # disassembly. For this scenario we need to check that the 

1869 # component requested is listed as a derived component for the 

1870 # composite storage class 

1871 isDisassembledReadOnlyComponent = False 

1872 if isDisassembled and refComponent: 

1873 # The composite storage class should be accessible through 

1874 # the component dataset type 

1875 compositeStorageClass = ref.datasetType.parentStorageClass 

1876 

1877 # In the unlikely scenario where the composite storage 

1878 # class is not known, we can only assume that this is a 

1879 # normal component. If that assumption is wrong then the 

1880 # branch below that reads a persisted component will fail 

1881 # so there is no need to complain here. 

1882 if compositeStorageClass is not None: 1882 ↛ 1885line 1882 didn't jump to line 1885, because the condition on line 1882 was never false

1883 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1884 

1885 if isDisassembled and not refComponent: 

1886 # This was a disassembled dataset spread over multiple files 

1887 # and we need to put them all back together again. 

1888 # Read into memory and then assemble 

1889 

1890 # Check that the supplied parameters are suitable for the type read 

1891 refStorageClass.validateParameters(parameters) 

1892 

1893 # We want to keep track of all the parameters that were not used 

1894 # by formatters. We assume that if any of the component formatters 

1895 # use a parameter that we do not need to apply it again in the 

1896 # assembler. 

1897 usedParams = set() 

1898 

1899 components: Dict[str, Any] = {} 

1900 for getInfo in allGetInfo: 

1901 # assemblerParams are parameters not understood by the 

1902 # associated formatter. 

1903 usedParams.update(set(getInfo.formatterParams)) 

1904 

1905 component = getInfo.component 

1906 

1907 if component is None: 1907 ↛ 1908line 1907 didn't jump to line 1908, because the condition on line 1907 was never true

1908 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1909 

1910 # We do not want the formatter to think it's reading 

1911 # a component though because it is really reading a 

1912 # standalone dataset -- always tell reader it is not a 

1913 # component. 

1914 components[component] = self._read_artifact_into_memory( 

1915 getInfo, ref.makeComponentRef(component), isComponent=False 

1916 ) 

1917 

1918 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1919 

1920 # Any unused parameters will have to be passed to the assembler 

1921 if parameters: 

1922 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1923 else: 

1924 unusedParams = {} 

1925 

1926 # Process parameters 

1927 return ref.datasetType.storageClass.delegate().handleParameters( 

1928 inMemoryDataset, parameters=unusedParams 

1929 ) 

1930 

1931 elif isDisassembledReadOnlyComponent: 

1932 

1933 compositeStorageClass = ref.datasetType.parentStorageClass 

1934 if compositeStorageClass is None: 1934 ↛ 1935line 1934 didn't jump to line 1935, because the condition on line 1934 was never true

1935 raise RuntimeError( 

1936 f"Unable to retrieve derived component '{refComponent}' since" 

1937 "no composite storage class is available." 

1938 ) 

1939 

1940 if refComponent is None: 1940 ↛ 1942line 1940 didn't jump to line 1942, because the condition on line 1940 was never true

1941 # Mainly for mypy 

1942 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1943 

1944 # Assume that every derived component can be calculated by 

1945 # forwarding the request to a single read/write component. 

1946 # Rather than guessing which rw component is the right one by 

1947 # scanning each for a derived component of the same name, 

1948 # we ask the storage class delegate directly which one is best to 

1949 # use. 

1950 compositeDelegate = compositeStorageClass.delegate() 

1951 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1952 refComponent, set(allComponents) 

1953 ) 

1954 

1955 # Select the relevant component 

1956 rwInfo = allComponents[forwardedComponent] 

1957 

1958 # For now assume that read parameters are validated against 

1959 # the real component and not the requested component 

1960 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1961 forwardedStorageClass.validateParameters(parameters) 

1962 

1963 # The reference to use for the caching must refer to the forwarded 

1964 # component and not the derived component. 

1965 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1966 

1967 # Unfortunately the FileDescriptor inside the formatter will have 

1968 # the wrong write storage class so we need to create a new one 

1969 # given the immutability constraint. 

1970 writeStorageClass = rwInfo.info.storageClass 

1971 

1972 # We may need to put some thought into parameters for read 

1973 # components but for now forward them on as is 

1974 readFormatter = type(rwInfo.formatter)( 

1975 FileDescriptor( 

1976 rwInfo.location, 

1977 readStorageClass=refStorageClass, 

1978 storageClass=writeStorageClass, 

1979 parameters=parameters, 

1980 ), 

1981 ref.dataId, 

1982 ) 

1983 

1984 # The assembler can not receive any parameter requests for a 

1985 # derived component at this time since the assembler will 

1986 # see the storage class of the derived component and those 

1987 # parameters will have to be handled by the formatter on the 

1988 # forwarded storage class. 

1989 assemblerParams: Dict[str, Any] = {} 

1990 

1991 # Need to create a new info that specifies the derived 

1992 # component and associated storage class 

1993 readInfo = DatastoreFileGetInformation( 

1994 rwInfo.location, 

1995 readFormatter, 

1996 rwInfo.info, 

1997 assemblerParams, 

1998 {}, 

1999 refComponent, 

2000 refStorageClass, 

2001 ) 

2002 

2003 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2004 

2005 else: 

2006 # Single file request or component from that composite file 

2007 for lookup in (refComponent, None): 2007 ↛ 2012line 2007 didn't jump to line 2012, because the loop on line 2007 didn't complete

2008 if lookup in allComponents: 2008 ↛ 2007line 2008 didn't jump to line 2007, because the condition on line 2008 was never false

2009 getInfo = allComponents[lookup] 

2010 break 

2011 else: 

2012 raise FileNotFoundError( 

2013 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2014 ) 

2015 

2016 # Do not need the component itself if already disassembled 

2017 if isDisassembled: 

2018 isComponent = False 

2019 else: 

2020 isComponent = getInfo.component is not None 

2021 

2022 # For a component read of a composite we want the cache to 

2023 # be looking at the composite ref itself. 

2024 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2025 

2026 # For a disassembled component we can validate parameters against 

2027 # the component storage class directly 

2028 if isDisassembled: 

2029 refStorageClass.validateParameters(parameters) 

2030 else: 

2031 # For an assembled composite this could be a derived 

2032 # component derived from a real component. The validity 

2033 # of the parameters is not clear. For now validate against 

2034 # the composite storage class 

2035 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2036 

2037 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2038 
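A sketch of reading a dataset and then a storage-class-defined subset of it; the parameter key used here is purely illustrative, since the accepted keys depend on the storage class being read:

# Full read of the in-memory dataset.
full = datastore.get(ref)

# Partial read: parameters are validated against the storage class and any
# keys a formatter does not use are forwarded to the assembler.
subset = datastore.get(ref, parameters={"parameter_name": "value"})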

2039 @transactional 

2040 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2041 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2042 

2043 Parameters 

2044 ---------- 

2045 inMemoryDataset : `object` 

2046 The dataset to store. 

2047 ref : `DatasetRef` 

2048 Reference to the associated Dataset. 

2049 

2050 Raises 

2051 ------ 

2052 TypeError 

2053 Supplied object and storage class are inconsistent. 

2054 DatasetTypeNotSupportedError 

2055 The associated `DatasetType` is not handled by this datastore. 

2056 

2057 Notes 

2058 ----- 

2059 If the datastore is configured to reject certain dataset types it 

2060 is possible that the put will fail and raise a 

2061 `DatasetTypeNotSupportedError`. The main use case for this is to 

2062 allow `ChainedDatastore` to put to multiple datastores without 

2063 requiring that every datastore accepts the dataset. 

2064 """ 

2065 

2066 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2067 # doDisassembly = True 

2068 

2069 artifacts = [] 

2070 if doDisassembly: 

2071 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2072 if components is None: 2072 ↛ 2073line 2072 didn't jump to line 2073, because the condition on line 2072 was never true

2073 raise RuntimeError( 

2074 f"Inconsistent configuration: dataset type {ref.datasetType.name} " 

2075 f"with storage class {ref.datasetType.storageClass.name} " 

2076 "is configured to be disassembled, but cannot be." 

2077 ) 

2078 for component, componentInfo in components.items(): 

2079 # Don't recurse because we want to take advantage of 

2080 # bulk insert -- need a new DatasetRef that refers to the 

2081 # same dataset_id but has the component DatasetType 

2082 # DatasetType does not refer to the types of components 

2083 # So we construct one ourselves. 

2084 compRef = ref.makeComponentRef(component) 

2085 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2086 artifacts.append((compRef, storedInfo)) 

2087 else: 

2088 # Write the entire thing out 

2089 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2090 artifacts.append((ref, storedInfo)) 

2091 

2092 self._register_datasets(artifacts) 

2093 
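A round-trip sketch, assuming `in_memory_dataset` is an object compatible with the storage class of the hypothetical resolved `ref`:

# Whether this is written as a single file or disassembled into one file per
# component is decided by the datastore's composites configuration.
datastore.put(in_memory_dataset, ref)
round_tripped = datastore.get(ref)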

2094 @transactional 

2095 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2096 # At this point can safely remove these datasets from the cache 

2097 # to avoid confusion later on. If they are not trashed later 

2098 # the cache will simply be refilled. 

2099 self.cacheManager.remove_from_cache(ref) 

2100 

2101 # If we are in trust mode there will be nothing to move to 

2102 # the trash table and we will have to try to delete the file 

2103 # immediately. 

2104 if self.trustGetRequest: 

2105 # Try to keep the logic below for a single file trash. 

2106 if isinstance(ref, DatasetRef): 

2107 refs = {ref} 

2108 else: 

2109 # Will recreate ref at the end of this branch. 

2110 refs = set(ref) 

2111 

2112 # Determine which datasets are known to datastore directly. 

2113 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2114 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2115 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2116 

2117 missing = refs - existing_refs 

2118 if missing: 

2119 # Do an explicit existence check on these refs. 

2120 # We only care about the artifacts at this point and not 

2121 # the dataset existence. 

2122 artifact_existence: Dict[ResourcePath, bool] = {} 

2123 _ = self.mexists(missing, artifact_existence) 

2124 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2125 

2126 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2127 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2128 for uri in uris: 

2129 try: 

2130 uri.remove() 

2131 except Exception as e: 

2132 if ignore_errors: 

2133 log.debug("Artifact %s could not be removed: %s", uri, e) 

2134 continue 

2135 raise 

2136 

2137 # There is no point asking the code below to remove refs we 

2138 # know are missing so update it with the list of existing 

2139 # records. Try to retain one vs many logic. 

2140 if not existing_refs: 

2141 # Nothing more to do since none of the datasets were 

2142 # known to the datastore record table. 

2143 return 

2144 ref = list(existing_refs) 

2145 if len(ref) == 1: 

2146 ref = ref[0] 

2147 

2148 # Get file metadata and internal metadata 

2149 if not isinstance(ref, DatasetRef): 

2150 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2151 # Assumed to be an iterable of refs so bulk mode enabled. 

2152 try: 

2153 self.bridge.moveToTrash(ref) 

2154 except Exception as e: 

2155 if ignore_errors: 

2156 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2157 else: 

2158 raise 

2159 return 

2160 

2161 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2162 

2163 fileLocations = self._get_dataset_locations_info(ref) 

2164 

2165 if not fileLocations: 

2166 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2167 if ignore_errors: 

2168 log.warning(err_msg) 

2169 return 

2170 else: 

2171 raise FileNotFoundError(err_msg) 

2172 

2173 for location, storedFileInfo in fileLocations: 

2174 if not self._artifact_exists(location): 2174 ↛ 2175line 2174 didn't jump to line 2175

2175 err_msg = ( 

2176 f"Dataset is known to datastore {self.name} but " 

2177 f"associated artifact ({location.uri}) is missing" 

2178 ) 

2179 if ignore_errors: 

2180 log.warning(err_msg) 

2181 return 

2182 else: 

2183 raise FileNotFoundError(err_msg) 

2184 

2185 # Mark dataset as trashed 

2186 try: 

2187 self.bridge.moveToTrash([ref]) 

2188 except Exception as e: 

2189 if ignore_errors: 

2190 log.warning( 

2191 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2192 "but encountered an error: %s", 

2193 ref, 

2194 self.name, 

2195 e, 

2196 ) 

2197 pass 

2198 else: 

2199 raise 

2200 

2201 @transactional 

2202 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2203 """Remove all datasets from the trash. 

2204 

2205 Parameters 

2206 ---------- 

2207 ignore_errors : `bool` 

2208 If `True` return without error even if something went wrong. 

2209 Problems could occur if another process is simultaneously trying 

2210 to delete. 

2211 """ 

2212 log.debug("Emptying trash in datastore %s", self.name) 

2213 

2214 # Context manager will empty trash iff we finish it without raising. 

2215 # It will also automatically delete the relevant rows from the 

2216 # trash table and the records table. 

2217 with self.bridge.emptyTrash( 

2218 self._table, record_class=StoredFileInfo, record_column="path" 

2219 ) as trash_data: 

2220 # Removing the artifacts themselves requires that the files are 

2221 # not also associated with refs that are not to be trashed. 

2222 # Therefore need to do a query with the file paths themselves 

2223 # and return all the refs associated with them. Can only delete 

2224 # a file if the refs to be trashed are the only refs associated 

2225 # with the file. 

2226 # This requires multiple copies of the trashed items 

2227 trashed, artifacts_to_keep = trash_data 

2228 

2229 if artifacts_to_keep is None: 

2230 # The bridge is not helping us so have to work it out 

2231 # ourselves. This is not going to be as efficient. 

2232 trashed = list(trashed) 

2233 

2234 # The instance check is for mypy since up to this point it 

2235 # does not know the type of info. 

2236 path_map = self._refs_associated_with_artifacts( 

2237 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2238 ) 

2239 

2240 for ref, info in trashed: 

2241 

2242 # Mypy needs to know this is not the base class 

2243 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2244 

2245 # Check for mypy 

2246 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2247 

2248 path_map[info.path].remove(ref.id) 

2249 if not path_map[info.path]: 2249 ↛ 2240line 2249 didn't jump to line 2240, because the condition on line 2249 was never false

2250 del path_map[info.path] 

2251 

2252 artifacts_to_keep = set(path_map) 

2253 

2254 for ref, info in trashed: 

2255 

2256 # Should not happen for this implementation but need 

2257 # to keep mypy happy. 

2258 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2259 

2260 # Mypy needs to know this is not the base class 

2261 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2262 

2263 # Check for mypy 

2264 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2265 

2266 if info.path in artifacts_to_keep: 

2267 # This is a multi-dataset artifact and we are not 

2268 # removing all associated refs. 

2269 continue 

2270 

2271 # Only trashed refs still known to datastore will be returned. 

2272 location = info.file_location(self.locationFactory) 

2273 

2274 # Point of no return for this artifact 

2275 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2276 try: 

2277 self._delete_artifact(location) 

2278 except FileNotFoundError: 

2279 # If the file itself has been deleted there is nothing 

2280 # we can do about it. It is possible that trash has 

2281 # been run in parallel in another process or someone 

2282 # decided to delete the file. It is unlikely to come 

2283 # back and so we should still continue with the removal 

2284 # of the entry from the trash table. It is also possible 

2285 # we removed it in a previous iteration if it was 

2286 # a multi-dataset artifact. The delete artifact method 

2287 # will log a debug message in this scenario. 

2288 # Distinguishing a file that was missing before the trash 

2289 # started from one already removed earlier in this trash 

2290 # operation is not worth the extra bookkeeping and its 

2291 # potential memory cost. 

2292 pass 

2293 except Exception as e: 

2294 if ignore_errors: 

2295 # Use a debug message here even though it's not 

2296 # a good situation. In some cases this can be 

2297 # caused by a race between user A and user B 

2298 # and neither of them has permissions for the 

2299 # other's files. Butler does not know about users 

2300 # and trash has no idea what collections these 

2301 # files were in (without guessing from a path). 

2302 log.debug( 

2303 "Encountered error removing artifact %s from datastore %s: %s", 

2304 location.uri, 

2305 self.name, 

2306 e, 

2307 ) 

2308 else: 

2309 raise 

2310 
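A sketch of the two-phase removal implemented above, using the hypothetical `datastore` and `refs`:

# Phase 1: mark the datasets as trashed (records move to the trash table).
datastore.trash(refs, ignore_errors=False)

# Phase 2: delete the file artifacts and drop the trashed records.
datastore.emptyTrash(ignore_errors=True)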

2311 @transactional 

2312 def transfer_from( 

2313 self, 

2314 source_datastore: Datastore, 

2315 refs: Iterable[DatasetRef], 

2316 local_refs: Optional[Iterable[DatasetRef]] = None, 

2317 transfer: str = "auto", 

2318 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2319 ) -> None: 

2320 # Docstring inherited 

2321 if type(self) is not type(source_datastore): 

2322 raise TypeError( 

2323 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2324 f"source datastore ({type(source_datastore)})." 

2325 ) 

2326 

2327 # Be explicit for mypy 

2328 if not isinstance(source_datastore, FileDatastore): 2328 ↛ 2329line 2328 didn't jump to line 2329, because the condition on line 2328 was never true

2329 raise TypeError( 

2330 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2331 f" {type(source_datastore)}" 

2332 ) 

2333 

2334 # Stop early if "direct" transfer mode is requested. That would 

2335 # require that the URI inside the source datastore should be stored 

2336 # directly in the target datastore, which seems unlikely to be useful 

2337 # since at any moment the source datastore could delete the file. 

2338 if transfer in ("direct", "split"): 

2339 raise ValueError( 

2340 f"Can not transfer from a source datastore using {transfer} mode since" 

2341 " those files are controlled by the other datastore." 

2342 ) 

2343 

2344 # Empty existence lookup if none given. 

2345 if artifact_existence is None: 

2346 artifact_existence = {} 

2347 

2348 # We will go through the list multiple times so must convert 

2349 # generators to lists. 

2350 refs = list(refs) 

2351 

2352 if local_refs is None: 

2353 local_refs = refs 

2354 else: 

2355 local_refs = list(local_refs) 

2356 

2357 # In order to handle disassembled composites the code works 

2358 # at the records level since it can assume that internal APIs 

2359 # can be used. 

2360 # - If the record already exists in the destination this is assumed 

2361 # to be okay. 

2362 # - If there is no record but the source and destination URIs are 

2363 # identical no transfer is done but the record is added. 

2364 # - If the source record refers to an absolute URI currently assume 

2365 # that that URI should remain absolute and will be visible to the 

2366 # destination butler. May need to have a flag to indicate whether 

2367 # the dataset should be transferred. This will only happen if 

2368 # the detached Butler has had a local ingest. 

2369 

2370 # What we really want is all the records in the source datastore 

2371 # associated with these refs. Or derived ones if they don't exist 

2372 # in the source. 

2373 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2374 

2375 # The source dataset_ids are the keys in these records 

2376 source_ids = set(source_records) 

2377 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2378 

2379 # The not None check is to appease mypy 

2380 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2381 missing_ids = requested_ids - source_ids 

2382 

2383 # Missing IDs can be okay if that datastore has allowed 

2384 # gets based on file existence. Should we transfer what we can 

2385 # or complain about it and warn? 

2386 if missing_ids and not source_datastore.trustGetRequest: 2386 ↛ 2387line 2386 didn't jump to line 2387, because the condition on line 2386 was never true

2387 raise ValueError( 

2388 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2389 ) 

2390 

2391 # Need to map these missing IDs to a DatasetRef so we can guess 

2392 # the details. 

2393 if missing_ids: 

2394 log.info( 

2395 "Number of expected datasets missing from source datastore records: %d out of %d", 

2396 len(missing_ids), 

2397 len(requested_ids), 

2398 ) 

2399 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2400 

2401 # This should be chunked in case we end up having to check 

2402 # the file store since we need some log output to show 

2403 # progress. 

2404 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2405 records = {} 

2406 for missing in missing_ids_chunk: 

2407 # Ask the source datastore where the missing artifacts 

2408 # should be. An execution butler might not know about the 

2409 # artifacts even if they are there. 

2410 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2411 records[missing] = [info for _, info in expected] 

2412 

2413 # Call the mexists helper method in case we have not already 

2414 # checked these artifacts such that artifact_existence is 

2415 # empty. This allows us to benefit from parallelism. 

2416 # datastore.mexists() itself does not give us access to the 

2417 # derived datastore record. 

2418 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2419 ref_exists = source_datastore._process_mexists_records( 

2420 id_to_ref, records, False, artifact_existence=artifact_existence 

2421 ) 

2422 

2423 # Now go through the records and propagate the ones that exist. 

2424 location_factory = source_datastore.locationFactory 

2425 for missing, record_list in records.items(): 

2426 # Skip completely if the ref does not exist. 

2427 ref = id_to_ref[missing] 

2428 if not ref_exists[ref]: 

2429 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2430 continue 

2431 # Check for file artifact to decide which parts of a 

2432 # disassembled composite do exist. If there is only a 

2433 # single record we don't even need to look because it can't 

2434 # be a composite and must exist. 

2435 if len(record_list) == 1: 

2436 dataset_records = record_list 

2437 else: 

2438 dataset_records = [ 

2439 record 

2440 for record in record_list 

2441 if artifact_existence[record.file_location(location_factory).uri] 

2442 ] 

2443 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2444 

2445 # Rely on source_records being a defaultdict. 

2446 source_records[missing].extend(dataset_records) 

2447 

2448 # See if we already have these records 

2449 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2450 

2451 # The artifacts to register 

2452 artifacts = [] 

2453 

2454 # Refs that already exist 

2455 already_present = [] 

2456 

2457 # Now can transfer the artifacts 

2458 for source_ref, target_ref in zip(refs, local_refs): 

2459 if target_ref.id in target_records: 

2460 # Already have an artifact for this. 

2461 already_present.append(target_ref) 

2462 continue 

2463 

2464 # mypy needs to know these are always resolved refs 

2465 for info in source_records[source_ref.getCheckedId()]: 

2466 source_location = info.file_location(source_datastore.locationFactory) 

2467 target_location = info.file_location(self.locationFactory) 

2468 if source_location == target_location: 2468 ↛ 2472line 2468 didn't jump to line 2472, because the condition on line 2468 was never true

2469 # Either the dataset is already in the target datastore 

2470 # (which is how execution butler currently runs) or 

2471 # it is an absolute URI. 

2472 if source_location.pathInStore.isabs(): 

2473 # Just because we can see the artifact when running 

2474 # the transfer doesn't mean it will be generally 

2475 # accessible to a user of this butler. For now warn 

2476 # but assume it will be accessible. 

2477 log.warning( 

2478 "Transfer request for an outside-datastore artifact has been found at %s", 

2479 source_location, 

2480 ) 

2481 else: 

2482 # Need to transfer it to the new location. 

2483 # Assume we should always overwrite. If the artifact 

2484 # is there this might indicate that a previous transfer 

2485 # was interrupted but was not able to be rolled back 

2486 # completely (eg pre-emption) so follow Datastore default 

2487 # and overwrite. 

2488 target_location.uri.transfer_from( 

2489 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2490 ) 

2491 

2492 artifacts.append((target_ref, info)) 

2493 

2494 self._register_datasets(artifacts) 

2495 

2496 if already_present: 

2497 n_skipped = len(already_present) 

2498 log.info( 

2499 "Skipped transfer of %d dataset%s already present in datastore", 

2500 n_skipped, 

2501 "" if n_skipped == 1 else "s", 

2502 ) 

2503 
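A sketch of copying datasets between two FileDatastore instances of the same type; `source_datastore`, `target_datastore`, `refs`, and `artifact_existence` are hypothetical names that would normally come from two Butler repositories and an earlier `mexists` call:

target_datastore.transfer_from(
    source_datastore,
    refs,
    transfer="copy",
    artifact_existence=artifact_existence,
)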

2504 @transactional 

2505 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2506 # Docstring inherited. 

2507 refs = list(refs) 

2508 self.bridge.forget(refs) 

2509 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2510 

2511 def validateConfiguration( 

2512 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2513 ) -> None: 

2514 """Validate some of the configuration for this datastore. 

2515 

2516 Parameters 

2517 ---------- 

2518 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2519 Entities to test against this configuration. Can be differing 

2520 types. 

2521 logFailures : `bool`, optional 

2522 If `True`, output a log message for every validation error 

2523 detected. 

2524 

2525 Raises 

2526 ------ 

2527 DatastoreValidationError 

2528 Raised if there is a validation problem with a configuration. 

2529 All the problems are reported in a single exception. 

2530 

2531 Notes 

2532 ----- 

2533 This method checks that all the supplied entities have valid file 

2534 templates and also have formatters defined. 

2535 """ 

2536 

2537 templateFailed = None 

2538 try: 

2539 self.templates.validateTemplates(entities, logFailures=logFailures) 

2540 except FileTemplateValidationError as e: 

2541 templateFailed = str(e) 

2542 

2543 formatterFailed = [] 

2544 for entity in entities: 

2545 try: 

2546 self.formatterFactory.getFormatterClass(entity) 

2547 except KeyError as e: 

2548 formatterFailed.append(str(e)) 

2549 if logFailures: 2549 ↛ 2544line 2549 didn't jump to line 2544, because the condition on line 2549 was never false

2550 log.critical("Formatter failure: %s", e) 

2551 

2552 if templateFailed or formatterFailed: 

2553 messages = [] 

2554 if templateFailed: 2554 ↛ 2555line 2554 didn't jump to line 2555, because the condition on line 2554 was never true

2555 messages.append(templateFailed) 

2556 if formatterFailed: 2556 ↛ 2558line 2556 didn't jump to line 2558, because the condition on line 2556 was never false

2557 messages.append(",".join(formatterFailed)) 

2558 msg = ";\n".join(messages) 

2559 raise DatastoreValidationError(msg) 

2560 

2561 def getLookupKeys(self) -> Set[LookupKey]: 

2562 # Docstring is inherited from base class 

2563 return ( 

2564 self.templates.getLookupKeys() 

2565 | self.formatterFactory.getLookupKeys() 

2566 | self.constraints.getLookupKeys() 

2567 ) 

2568 

2569 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2570 # Docstring is inherited from base class 

2571 # The key can be valid in either formatters or templates so we can 

2572 # only check the template if it exists 

2573 if lookupKey in self.templates: 

2574 try: 

2575 self.templates[lookupKey].validateTemplate(entity) 

2576 except FileTemplateValidationError as e: 

2577 raise DatastoreValidationError(e) from e 

2578 

2579 def export( 

2580 self, 

2581 refs: Iterable[DatasetRef], 

2582 *, 

2583 directory: Optional[ResourcePathExpression] = None, 

2584 transfer: Optional[str] = "auto", 

2585 ) -> Iterable[FileDataset]: 

2586 # Docstring inherited from Datastore.export. 

2587 if transfer is not None and directory is None: 2587 ↛ 2588line 2587 didn't jump to line 2588, because the condition on line 2587 was never true

2588 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2589 

2590 # Force the directory to be a URI object 

2591 directoryUri: Optional[ResourcePath] = None 

2592 if directory is not None: 2592 ↛ 2595line 2592 didn't jump to line 2595, because the condition on line 2592 was never false

2593 directoryUri = ResourcePath(directory, forceDirectory=True) 

2594 

2595 if transfer is not None and directoryUri is not None: 2595 ↛ 2600line 2595 didn't jump to line 2600, because the condition on line 2595 was never false

2596 # mypy needs the second test 

2597 if not directoryUri.exists(): 2597 ↛ 2598line 2597 didn't jump to line 2598, because the condition on line 2597 was never true

2598 raise FileNotFoundError(f"Export location {directory} does not exist") 

2599 

2600 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2601 for ref in progress.wrap(refs, "Exporting dataset files"): 

2602 fileLocations = self._get_dataset_locations_info(ref) 

2603 if not fileLocations: 2603 ↛ 2604line 2603 didn't jump to line 2604, because the condition on line 2603 was never true

2604 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2605 # For now we can not export disassembled datasets 

2606 if len(fileLocations) > 1: 

2607 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2608 location, storedFileInfo = fileLocations[0] 

2609 

2610 pathInStore = location.pathInStore.path 

2611 if transfer is None: 2611 ↛ 2615line 2611 didn't jump to line 2615, because the condition on line 2611 was never true

2612 # TODO: do we also need to return the readStorageClass somehow? 

2613 # We will use the path in store directly. If this is an 

2614 # absolute URI, preserve it. 

2615 if location.pathInStore.isabs(): 

2616 pathInStore = str(location.uri) 

2617 elif transfer == "direct": 2617 ↛ 2619line 2617 didn't jump to line 2619, because the condition on line 2617 was never true

2618 # Use full URIs to the remote store in the export 

2619 pathInStore = str(location.uri) 

2620 else: 

2621 # mypy needs help 

2622 assert directoryUri is not None, "directoryUri must be defined to get here" 

2623 storeUri = ResourcePath(location.uri) 

2624 

2625 # if the datastore has an absolute URI to a resource, we 

2626 # have two options: 

2627 # 1. Keep the absolute URI in the exported YAML 

2628 # 2. Allocate a new name in the local datastore and transfer 

2629 # it. 

2630 # For now go with option 2 

2631 if location.pathInStore.isabs(): 2631 ↛ 2632line 2631 didn't jump to line 2632, because the condition on line 2631 was never true

2632 template = self.templates.getTemplate(ref) 

2633 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2634 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2635 

2636 exportUri = directoryUri.join(pathInStore) 

2637 exportUri.transfer_from(storeUri, transfer=transfer) 

2638 

2639 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2640 
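A sketch of exporting file artifacts; `export` is a generator, so it is consumed here with `list`, and the directory path is hypothetical and assumed to exist:

file_datasets = list(
    datastore.export(refs, directory="/tmp/butler-export", transfer="copy")
)
for fd in file_datasets:
    # Each FileDataset records the exported path and the refs it holds.
    print(fd.path, [str(r) for r in fd.refs])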

2641 @staticmethod 

2642 def computeChecksum( 

2643 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2644 ) -> Optional[str]: 

2645 """Compute the checksum of the supplied file. 

2646 

2647 Parameters 

2648 ---------- 

2649 uri : `lsst.resources.ResourcePath` 

2650 Name of resource to calculate checksum from. 

2651 algorithm : `str`, optional 

2652 Name of algorithm to use. Must be one of the algorithms supported 

2653 by :py:mod:`hashlib`. 

2654 block_size : `int` 

2655 Number of bytes to read from file at one time. 

2656 

2657 Returns 

2658 ------- 

2659 hexdigest : `str` 

2660 Hex digest of the file. 

2661 

2662 Notes 

2663 ----- 

2664 Currently returns None if the URI is for a remote resource. 

2665 """ 

2666 if algorithm not in hashlib.algorithms_guaranteed: 2666 ↛ 2667line 2666 didn't jump to line 2667, because the condition on line 2666 was never true

2667 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2668 

2669 if not uri.isLocal: 2669 ↛ 2670line 2669 didn't jump to line 2670, because the condition on line 2669 was never true

2670 return None 

2671 

2672 hasher = hashlib.new(algorithm) 

2673 

2674 with uri.as_local() as local_uri: 

2675 with open(local_uri.ospath, "rb") as f: 

2676 for chunk in iter(lambda: f.read(block_size), b""): 

2677 hasher.update(chunk) 

2678 

2679 return hasher.hexdigest() 

2680 
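A sketch of the static checksum helper on a local file; the path is hypothetical, and a non-local URI would return `None` as noted above:

from lsst.daf.butler.datastores.fileDatastore import FileDatastore
from lsst.resources import ResourcePath

digest = FileDatastore.computeChecksum(
    ResourcePath("data/example.fits"),
    algorithm="blake2b",
    block_size=8192,
)
print(digest)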

2681 def needs_expanded_data_ids( 

2682 self, 

2683 transfer: Optional[str], 

2684 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2685 ) -> bool: 

2686 # Docstring inherited. 

2687 # This _could_ also use entity to inspect whether the filename template 

2688 # involves placeholders other than the required dimensions for its 

2689 # dataset type, but that's not necessary for correctness; it just 

2690 # enables more optimizations (perhaps only in theory). 

2691 return transfer not in ("direct", None)