Coverage for python/lsst/daf/butler/datastores/fileDatastore.py: 84%

860 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore",) 

26 

27import hashlib 

28import logging 

29from collections import defaultdict 

30from dataclasses import dataclass 

31from typing import ( 

32 TYPE_CHECKING, 

33 Any, 

34 ClassVar, 

35 Dict, 

36 Iterable, 

37 List, 

38 Mapping, 

39 Optional, 

40 Set, 

41 Tuple, 

42 Type, 

43 Union, 

44) 

45 

46from lsst.daf.butler import ( 

47 CompositesMap, 

48 Config, 

49 DatasetId, 

50 DatasetRef, 

51 DatasetType, 

52 DatasetTypeNotSupportedError, 

53 Datastore, 

54 DatastoreCacheManager, 

55 DatastoreConfig, 

56 DatastoreDisabledCacheManager, 

57 DatastoreValidationError, 

58 FileDataset, 

59 FileDescriptor, 

60 FileTemplates, 

61 FileTemplateValidationError, 

62 Formatter, 

63 FormatterFactory, 

64 Location, 

65 LocationFactory, 

66 Progress, 

67 StorageClass, 

68 StoredFileInfo, 

69 ddl, 

70) 

71from lsst.daf.butler.core.repoRelocation import replaceRoot 

72from lsst.daf.butler.core.utils import transactional 

73from lsst.daf.butler.registry.interfaces import DatastoreRegistryBridge, ReadOnlyDatabaseError 

74from lsst.resources import ResourcePath, ResourcePathExpression 

75from lsst.utils.introspection import get_class_of, get_instance_of 

76from lsst.utils.iteration import chunk_iterable 

77 

78# For VERBOSE logging usage. 

79from lsst.utils.logging import VERBOSE, getLogger 

80from lsst.utils.timer import time_this 

81from sqlalchemy import BigInteger, String 

82 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING: 85 ↛ 86 (the condition on line 85 was never true)

86 from lsst.daf.butler import AbstractDatastoreCacheManager, LookupKey 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = getLogger(__name__) 

90 

91 

92class _IngestPrepData(Datastore.IngestPrepData): 

93 """Helper class for FileDatastore ingest implementation. 

94 

95 Parameters 

96 ---------- 

97 datasets : `list` of `FileDataset` 

98 Files to be ingested by this datastore. 

99 """ 

100 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Dict[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Dict[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be None if no defaults specified. 

158 """ 

159 

160 root: ResourcePath 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be None if no defaults specified. 

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters( 

212 DatastoreConfig, 

213 config, 

214 full, 

215 toUpdate={"root": root}, 

216 toCopy=("cls", ("records", "table")), 

217 overwrite=overwrite, 

218 ) 

219 
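
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# The toUpdate/toCopy/overwrite semantics documented in setConfigRoot() above,
# shown with plain dicts instead of the real Config/DatastoreConfig classes.
# The helper name and dict layout below are hypothetical.
def _merge_repo_config(config: dict, full: dict, root: str, overwrite: bool = True) -> None:
    """Update ``config`` in place: set the new root and copy the
    repository-specific keys from ``full``, keeping explicit values
    when ``overwrite`` is False."""
    to_update = {"root": root}
    to_copy = ["cls", ("records", "table")]

    for key, value in to_update.items():
        if overwrite or key not in config:
            config[key] = value

    for key in to_copy:
        if isinstance(key, tuple):  # nested key such as ("records", "table")
            section, subkey = key
            src = full.get(section, {}).get(subkey)
            dst = config.setdefault(section, {})
            if src is not None and (overwrite or subkey not in dst):
                dst[subkey] = src
        elif key in full and (overwrite or key not in config):
            config[key] = full[key]


# Example: explicit values survive when overwrite=False, missing keys are copied.
# cfg = {"root": "/old/root", "records": {"table": "user_table"}}
# full = {"cls": "FileDatastore", "records": {"table": "file_datastore_records"}}
# _merge_repo_config(cfg, full, root="/new/repo", overwrite=False)
# -> cfg keeps its root and table name but gains "cls".
# -----------------------------------------------------------------------------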

220 @classmethod 

221 def makeTableSpec(cls, datasetIdColumnType: type) -> ddl.TableSpec: 

222 return ddl.TableSpec( 

223 fields=[ 

224 ddl.FieldSpec(name="dataset_id", dtype=datasetIdColumnType, primaryKey=True), 

225 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

226 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

227 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

228 # Use empty string to indicate no component 

229 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

230 # TODO: should checksum be Base64Bytes instead? 

231 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

232 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

233 ], 

234 unique=frozenset(), 

235 indexes=[tuple(["path"])], 

236 ) 

237 
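
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# Rough relational shape of the records table defined by makeTableSpec(),
# expressed as stdlib sqlite3 DDL.  The table name, column lengths, and the
# TEXT/INTEGER choices are approximations; the real table is created through
# the registry's ddl layer and the dataset_id type comes from configuration.
import sqlite3

_DEMO_DDL = """
CREATE TABLE file_datastore_records (
    dataset_id    TEXT    NOT NULL,      -- type depends on datasetIdColumnType
    path          TEXT    NOT NULL,      -- up to 256 chars in the real spec
    formatter     TEXT    NOT NULL,
    storage_class TEXT    NOT NULL,
    component     TEXT    NOT NULL,      -- empty string means "no component"
    checksum      TEXT,
    file_size     INTEGER,
    PRIMARY KEY (dataset_id, component)  -- one row per (dataset, component)
)
"""


def _make_demo_table() -> sqlite3.Connection:
    conn = sqlite3.connect(":memory:")
    conn.execute(_DEMO_DDL)
    # makeTableSpec also requests an index on ``path`` so that
    # path -> dataset_id lookups (used when deciding whether an artifact
    # can be deleted) stay cheap.
    conn.execute("CREATE INDEX file_datastore_records_path ON file_datastore_records (path)")
    return conn
# -----------------------------------------------------------------------------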

238 def __init__( 

239 self, 

240 config: Union[DatastoreConfig, str], 

241 bridgeManager: DatastoreRegistryBridgeManager, 

242 butlerRoot: Optional[str] = None, 

243 ): 

244 super().__init__(config, bridgeManager) 

245 if "root" not in self.config: 245 ↛ 246 (the condition on line 245 was never true)

246 raise ValueError("No root directory specified in configuration") 

247 

248 # Name ourselves either using an explicit name or a name 

249 # derived from the (unexpanded) root 

250 if "name" in self.config: 

251 self.name = self.config["name"] 

252 else: 

253 # We use the unexpanded root in the name to indicate that this 

254 # datastore can be moved without having to update registry. 

255 self.name = "{}@{}".format(type(self).__name__, self.config["root"]) 

256 

257 # Support repository relocation in config 

258 # Existence of self.root is checked in subclass 

259 self.root = ResourcePath( 

260 replaceRoot(self.config["root"], butlerRoot), forceDirectory=True, forceAbsolute=True 

261 ) 

262 

263 self.locationFactory = LocationFactory(self.root) 

264 self.formatterFactory = FormatterFactory() 

265 

266 # Now associate formatters with storage classes 

267 self.formatterFactory.registerFormatters(self.config["formatters"], universe=bridgeManager.universe) 

268 

269 # Read the file naming templates 

270 self.templates = FileTemplates(self.config["templates"], universe=bridgeManager.universe) 

271 

272 # See if composites should be disassembled 

273 self.composites = CompositesMap(self.config["composites"], universe=bridgeManager.universe) 

274 

275 tableName = self.config["records", "table"] 

276 try: 

277 # Storage of paths and formatters, keyed by dataset_id 

278 self._table = bridgeManager.opaque.register( 

279 tableName, self.makeTableSpec(bridgeManager.datasetIdColumnType) 

280 ) 

281 # Interface to Registry. 

282 self._bridge = bridgeManager.register(self.name) 

283 except ReadOnlyDatabaseError: 

284 # If the database is read only and we just tried and failed to 

285 # create a table, it means someone is trying to create a read-only 

286 # butler client for an empty repo. That should be okay, as long 

287 # as they then try to get any datasets before some other client 

288 # creates the table. Chances are they're just validating 

289 # configuration. 

290 pass 

291 

292 # Determine whether checksums should be used - default to False 

293 self.useChecksum = self.config.get("checksum", False) 

294 

295 # Determine whether we can fall back to configuration if a 

296 # requested dataset is not known to registry 

297 self.trustGetRequest = self.config.get("trust_get_request", False) 

298 

299 # Create a cache manager 

300 self.cacheManager: AbstractDatastoreCacheManager 

301 if "cached" in self.config: 301 ↛ 304 (the condition on line 301 was never false)

302 self.cacheManager = DatastoreCacheManager(self.config["cached"], universe=bridgeManager.universe) 

303 else: 

304 self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) 

305 

306 # Check existence and create directory structure if necessary 

307 if not self.root.exists(): 

308 if "create" not in self.config or not self.config["create"]: 308 ↛ 309 (the condition on line 308 was never true)

309 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

310 try: 

311 self.root.mkdir() 

312 except Exception as e: 

313 raise ValueError( 

314 f"Can not create datastore root '{self.root}', check permissions. Got error: {e}" 

315 ) from e 

316 
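
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# The root handling in __init__ amounts to: substitute the butler root into
# the configured value, make it absolute, and create the directory only if the
# configuration allows it.  Stdlib-only sketch; the "<butlerRoot>" placeholder
# token and the helper name are assumptions made for illustration, not the
# documented behaviour of replaceRoot().
from pathlib import Path
from typing import Optional


def _resolve_datastore_root(configured_root: str, butler_root: Optional[str], create: bool) -> Path:
    if butler_root is not None:
        configured_root = configured_root.replace("<butlerRoot>", butler_root)
    root = Path(configured_root).expanduser().resolve()
    if not root.exists():
        if not create:
            raise ValueError(f"No valid root and not allowed to create one at: {root}")
        try:
            root.mkdir(parents=True)
        except OSError as e:
            raise ValueError(f"Can not create datastore root '{root}', check permissions.") from e
    return root
# -----------------------------------------------------------------------------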

317 def __str__(self) -> str: 

318 return str(self.root) 

319 

320 @property 

321 def bridge(self) -> DatastoreRegistryBridge: 

322 return self._bridge 

323 

324 def _artifact_exists(self, location: Location) -> bool: 

325 """Check that an artifact exists in this datastore at the specified 

326 location. 

327 

328 Parameters 

329 ---------- 

330 location : `Location` 

331 Expected location of the artifact associated with this datastore. 

332 

333 Returns 

334 ------- 

335 exists : `bool` 

336 `True` if the location can be found, `False` otherwise. 

337 """ 

338 log.debug("Checking if resource exists: %s", location.uri) 

339 return location.uri.exists() 

340 

341 def _delete_artifact(self, location: Location) -> None: 

342 """Delete the artifact from the datastore. 

343 

344 Parameters 

345 ---------- 

346 location : `Location` 

347 Location of the artifact associated with this datastore. 

348 """ 

349 if location.pathInStore.isabs(): 349 ↛ 350 (the condition on line 349 was never true)

350 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

351 

352 try: 

353 location.uri.remove() 

354 except FileNotFoundError: 

355 log.debug("File %s did not exist and so could not be deleted.", location.uri) 

356 raise 

357 except Exception as e: 

358 log.critical("Failed to delete file: %s (%s)", location.uri, e) 

359 raise 

360 log.debug("Successfully deleted file: %s", location.uri) 

361 

362 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

363 # Docstring inherited from GenericBaseDatastore 

364 records = [info.to_record(ref) for ref, info in zip(refs, infos)] 

365 self._table.insert(*records) 

366 

367 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

368 # Docstring inherited from GenericBaseDatastore 

369 

370 # Look for the dataset_id -- there might be multiple matches 

371 # if we have disassembled the dataset. 

372 records = self._table.fetch(dataset_id=ref.id) 

373 return [StoredFileInfo.from_record(record) for record in records] 

374 

375 def _get_stored_records_associated_with_refs( 

376 self, refs: Iterable[DatasetIdRef] 

377 ) -> Dict[DatasetId, List[StoredFileInfo]]: 

378 """Retrieve all records associated with the provided refs. 

379 

380 Parameters 

381 ---------- 

382 refs : iterable of `DatasetIdRef` 

383 The refs for which records are to be retrieved. 

384 

385 Returns 

386 ------- 

387 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

388 The matching records indexed by the ref ID. The number of entries 

389 in the dict can be smaller than the number of requested refs. 

390 """ 

391 records = self._table.fetch(dataset_id=[ref.id for ref in refs]) 

392 

393 # Uniqueness is dataset_id + component so can have multiple records 

394 # per ref. 

395 records_by_ref = defaultdict(list) 

396 for record in records: 

397 records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) 

398 return records_by_ref 

399 

400 def _refs_associated_with_artifacts( 

401 self, paths: List[Union[str, ResourcePath]] 

402 ) -> Dict[str, Set[DatasetId]]: 

403 """Return paths and associated dataset refs. 

404 

405 Parameters 

406 ---------- 

407 paths : `list` of `str` or `lsst.resources.ResourcePath` 

408 All the paths to include in search. 

409 

410 Returns 

411 ------- 

412 mapping : `dict` of [`str`, `set` [`DatasetId`]] 

413 Mapping of each path to a set of associated database IDs. 

414 """ 

415 records = self._table.fetch(path=[str(path) for path in paths]) 

416 result = defaultdict(set) 

417 for row in records: 

418 result[row["path"]].add(row["dataset_id"]) 

419 return result 

420 

421 def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> Set[DatasetId]: 

422 """Return all dataset refs associated with the supplied path. 

423 

424 Parameters 

425 ---------- 

426 pathInStore : `lsst.resources.ResourcePath` 

427 Path of interest in the data store. 

428 

429 Returns 

430 ------- 

431 ids : `set` of `DatasetId` 

432 All `DatasetRef` IDs associated with this path. 

433 """ 

434 records = list(self._table.fetch(path=str(pathInStore))) 

435 ids = {r["dataset_id"] for r in records} 

436 return ids 

437 

438 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

439 # Docstring inherited from GenericBaseDatastore 

440 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

441 

442 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

443 r"""Find all the `Location`\ s of the requested dataset in the 

444 `Datastore` and the associated stored file information. 

445 

446 Parameters 

447 ---------- 

448 ref : `DatasetRef` 

449 Reference to the required `Dataset`. 

450 

451 Returns 

452 ------- 

453 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

454 Location of the dataset within the datastore and 

455 stored information about each file and its formatter. 

456 """ 

457 # Get the file information (this will fail if no file) 

458 records = self.getStoredItemsInfo(ref) 

459 

460 # Use the path to determine the location -- we need to take 

461 # into account absolute URIs in the datastore record 

462 return [(r.file_location(self.locationFactory), r) for r in records] 

463 

464 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

465 """Check that there is only one dataset associated with the 

466 specified artifact. 

467 

468 Parameters 

469 ---------- 

470 ref : `DatasetRef` or `FakeDatasetRef` 

471 Dataset to be removed. 

472 location : `Location` 

473 The location of the artifact to be removed. 

474 

475 Returns 

476 ------- 

477 can_remove : `bool` 

478 True if the artifact can be safely removed. 

479 """ 

480 # Can't ever delete absolute URIs. 

481 if location.pathInStore.isabs(): 

482 return False 

483 

484 # Get all entries associated with this path 

485 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

486 if not allRefs: 

487 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

488 

489 # Remove these refs from all the refs and if there is nothing left 

490 # then we can delete 

491 remainingRefs = allRefs - {ref.id} 

492 

493 if remainingRefs: 

494 return False 

495 return True 

496 

497 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, StoredFileInfo]]: 

498 """Predict the location and related file information of the requested 

499 dataset in this datastore. 

500 

501 Parameters 

502 ---------- 

503 ref : `DatasetRef` 

504 Reference to the required `Dataset`. 

505 

506 Returns 

507 ------- 

508 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

509 Expected Location of the dataset within the datastore and 

510 placeholder information about each file and its formatter. 

511 

512 Notes 

513 ----- 

514 Uses the current configuration to determine how we would expect the 

515 datastore files to have been written if we couldn't ask registry. 

516 This is safe so long as there has been no change to datastore 

517 configuration between writing the dataset and wanting to read it. 

518 Will not work for files that have been ingested without using the 

519 standard file template or default formatter. 

520 """ 

521 

522 # If we have a component ref we always need to ask the questions 

523 # of the composite. If the composite is disassembled this routine 

524 # should return all components. If the composite was not 

525 # disassembled the composite is what is stored regardless of 

526 # component request. Note that if the caller has disassembled 

527 # a composite there is no way for this guess to know that 

528 # without trying both the composite and component ref and seeing 

529 # if there is something at the component Location even without 

530 # disassembly being enabled. 

531 if ref.datasetType.isComponent(): 

532 ref = ref.makeCompositeRef() 

533 

534 # See if the ref is a composite that should be disassembled 

535 doDisassembly = self.composites.shouldBeDisassembled(ref) 

536 

537 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

538 

539 if doDisassembly: 

540 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

541 compRef = ref.makeComponentRef(component) 

542 location, formatter = self._determine_put_formatter_location(compRef) 

543 all_info.append((location, formatter, componentStorage, component)) 

544 

545 else: 

546 # Always use the composite ref if no disassembly 

547 location, formatter = self._determine_put_formatter_location(ref) 

548 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

549 

550 # Convert the list of tuples to have StoredFileInfo as second element 

551 return [ 

552 ( 

553 location, 

554 StoredFileInfo( 

555 formatter=formatter, 

556 path=location.pathInStore.path, 

557 storageClass=storageClass, 

558 component=component, 

559 checksum=None, 

560 file_size=-1, 

561 ), 

562 ) 

563 for location, formatter, storageClass, component in all_info 

564 ] 

565 

566 def _prepare_for_get( 

567 self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None 

568 ) -> List[DatastoreFileGetInformation]: 

569 """Check parameters for ``get`` and obtain formatter and 

570 location. 

571 

572 Parameters 

573 ---------- 

574 ref : `DatasetRef` 

575 Reference to the required Dataset. 

576 parameters : `dict` 

577 `StorageClass`-specific parameters that specify, for example, 

578 a slice of the dataset to be loaded. 

579 

580 Returns 

581 ------- 

582 getInfo : `list` [`DatastoreFileGetInformation`] 

583 Parameters needed to retrieve each file. 

584 """ 

585 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

586 

587 # Get file metadata and internal metadata 

588 fileLocations = self._get_dataset_locations_info(ref) 

589 if not fileLocations: 

590 if not self.trustGetRequest: 

591 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

592 # Assume the dataset is where we think it should be 

593 fileLocations = self._get_expected_dataset_locations_info(ref) 

594 

595 # The storage class we want to use eventually 

596 refStorageClass = ref.datasetType.storageClass 

597 

598 if len(fileLocations) > 1: 

599 disassembled = True 

600 

601 # If trust is involved it is possible that there will be 

602 # components listed here that do not exist in the datastore. 

603 # Explicitly check for file artifact existence and filter out any 

604 # that are missing. 

605 if self.trustGetRequest: 

606 fileLocations = [loc for loc in fileLocations if loc[0].uri.exists()] 

607 

608 # For now complain only if we have no components at all. Missing 

609 # some (but not all) components is probably a problem but we can 

610 # punt that to the assembler. 

611 if not fileLocations: 611 ↛ 612 (the condition on line 611 was never true)

612 raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") 

613 

614 else: 

615 disassembled = False 

616 

617 # Is this a component request? 

618 refComponent = ref.datasetType.component() 

619 

620 fileGetInfo = [] 

621 for location, storedFileInfo in fileLocations: 

622 

623 # The storage class used to write the file 

624 writeStorageClass = storedFileInfo.storageClass 

625 

626 # If this has been disassembled we need read to match the write 

627 if disassembled: 

628 readStorageClass = writeStorageClass 

629 else: 

630 readStorageClass = refStorageClass 

631 

632 formatter = get_instance_of( 

633 storedFileInfo.formatter, 

634 FileDescriptor( 

635 location, 

636 readStorageClass=readStorageClass, 

637 storageClass=writeStorageClass, 

638 parameters=parameters, 

639 ), 

640 ref.dataId, 

641 ) 

642 

643 formatterParams, notFormatterParams = formatter.segregateParameters() 

644 

645 # Of the remaining parameters, extract the ones supported by 

646 # this StorageClass (for components not all will be handled) 

647 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

648 

649 # The ref itself could be a component if the dataset was 

650 # disassembled by butler, or we disassembled in datastore and 

651 # components came from the datastore records 

652 component = storedFileInfo.component if storedFileInfo.component else refComponent 

653 

654 fileGetInfo.append( 

655 DatastoreFileGetInformation( 

656 location, 

657 formatter, 

658 storedFileInfo, 

659 assemblerParams, 

660 formatterParams, 

661 component, 

662 readStorageClass, 

663 ) 

664 ) 

665 

666 return fileGetInfo 

667 

668 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

669 """Check the arguments for ``put`` and obtain formatter and 

670 location. 

671 

672 Parameters 

673 ---------- 

674 inMemoryDataset : `object` 

675 The dataset to store. 

676 ref : `DatasetRef` 

677 Reference to the associated Dataset. 

678 

679 Returns 

680 ------- 

681 location : `Location` 

682 The location to write the dataset. 

683 formatter : `Formatter` 

684 The `Formatter` to use to write the dataset. 

685 

686 Raises 

687 ------ 

688 TypeError 

689 Supplied object and storage class are inconsistent. 

690 DatasetTypeNotSupportedError 

691 The associated `DatasetType` is not handled by this datastore. 

692 """ 

693 self._validate_put_parameters(inMemoryDataset, ref) 

694 return self._determine_put_formatter_location(ref) 

695 

696 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

697 """Calculate the formatter and output location to use for put. 

698 

699 Parameters 

700 ---------- 

701 ref : `DatasetRef` 

702 Reference to the associated Dataset. 

703 

704 Returns 

705 ------- 

706 location : `Location` 

707 The location to write the dataset. 

708 formatter : `Formatter` 

709 The `Formatter` to use to write the dataset. 

710 """ 

711 # Work out output file name 

712 try: 

713 template = self.templates.getTemplate(ref) 

714 except KeyError as e: 

715 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

716 

717 # Validate the template to protect against different dataIds 

718 # producing the same filename and causing overwrite confusion. 

719 template.validateTemplate(ref) 

720 

721 location = self.locationFactory.fromPath(template.format(ref)) 

722 

723 # Get the formatter based on the storage class 

724 storageClass = ref.datasetType.storageClass 

725 try: 

726 formatter = self.formatterFactory.getFormatter( 

727 ref, FileDescriptor(location, storageClass=storageClass), ref.dataId 

728 ) 

729 except KeyError as e: 

730 raise DatasetTypeNotSupportedError( 

731 f"Unable to find formatter for {ref} in datastore {self.name}" 

732 ) from e 

733 

734 # Now that we know the formatter, update the location 

735 location = formatter.makeUpdatedLocation(location) 

736 

737 return location, formatter 

738 

739 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

740 # Docstring inherited from base class 

741 if transfer != "auto": 

742 return transfer 

743 

744 # See if the paths are within the datastore or not 

745 inside = [self._pathInStore(d.path) is not None for d in datasets] 

746 

747 if all(inside): 

748 transfer = None 

749 elif not any(inside): 749 ↛ 758 (the condition on line 749 was never false)

750 # Allow ResourcePath to use its own knowledge 

751 transfer = "auto" 

752 else: 

753 # This can happen when importing from a datastore that 

754 # has had some datasets ingested using "direct" mode, so some 

755 # files are inside the datastore root and some are not. 

756 # Allow ResourcePath to sort it out, but warn about it because 

757 # the external files will not be copied. 

758 log.warning( 

759 "Some datasets are inside the datastore and some are outside. Using 'split' " 

760 "transfer mode. This assumes that the files outside the datastore are " 

761 "still accessible to the new butler since they will not be copied into " 

762 "the target datastore." 

763 ) 

764 transfer = "split" 

765 

766 return transfer 

767 
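
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# The "auto" resolution in _overrideTransferMode() above picks one of three
# effective modes depending on whether the files already live under the
# datastore root.  Stdlib sketch of that decision; the helper name is
# hypothetical.
import os
from typing import Iterable, Optional


def _resolve_auto_transfer(paths: Iterable[str], root: str) -> Optional[str]:
    root = os.path.abspath(root)
    inside = [
        os.path.commonpath([os.path.abspath(p), root]) == root
        for p in paths
    ]
    if all(inside):
        return None       # already in place, nothing to transfer
    if not any(inside):
        return "auto"     # let the resource layer pick copy/link/etc.
    return "split"        # mixture: external files are referenced, not copied
# -----------------------------------------------------------------------------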

768 def _pathInStore(self, path: ResourcePathExpression) -> Optional[str]: 

769 """Return path relative to datastore root 

770 

771 Parameters 

772 ---------- 

773 path : `lsst.resources.ResourcePathExpression` 

774 Path to dataset. Can be an absolute URI. If relative, it is 

775 assumed to be relative to the datastore root. Returns the path 

776 within the datastore, or `None` if the path is outside it. 

777 

778 Returns 

779 ------- 

780 inStore : `str` 

781 Path relative to datastore root. Returns `None` if the file is 

782 outside the root. 

783 """ 

784 # Relative path will always be relative to datastore 

785 pathUri = ResourcePath(path, forceAbsolute=False) 

786 return pathUri.relative_to(self.root) 

787 
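
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# _pathInStore() relies on ResourcePath.relative_to() returning the relative
# path when the file is under the root and None otherwise.  A pathlib
# equivalent of that contract, for readers unfamiliar with lsst.resources:
from pathlib import Path
from typing import Optional


def _path_in_store(path: str, root: str) -> Optional[str]:
    try:
        return str(Path(path).resolve().relative_to(Path(root).resolve()))
    except ValueError:
        # pathlib raises when the path is not under root; mirror the
        # "return None for paths outside the root" behaviour used above.
        return None
# -----------------------------------------------------------------------------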

788 def _standardizeIngestPath( 

789 self, path: Union[str, ResourcePath], *, transfer: Optional[str] = None 

790 ) -> Union[str, ResourcePath]: 

791 """Standardize the path of a to-be-ingested file. 

792 

793 Parameters 

794 ---------- 

795 path : `str` or `lsst.resources.ResourcePath` 

796 Path of a file to be ingested. This parameter is not expected 

797 to be all the types that can be used to construct a 

798 `~lsst.resources.ResourcePath`. 

799 transfer : `str`, optional 

800 How (and whether) the dataset should be added to the datastore. 

801 See `ingest` for details of transfer modes. 

802 This implementation is provided only so 

803 `NotImplementedError` can be raised if the mode is not supported; 

804 actual transfers are deferred to `_extractIngestInfo`. 

805 

806 Returns 

807 ------- 

808 path : `str` or `lsst.resources.ResourcePath` 

809 New path in what the datastore considers standard form. If an 

810 absolute URI was given that will be returned unchanged. 

811 

812 Notes 

813 ----- 

814 Subclasses of `FileDatastore` can implement this method instead 

815 of `_prepIngest`. It should not modify the data repository or given 

816 file in any way. 

817 

818 Raises 

819 ------ 

820 NotImplementedError 

821 Raised if the datastore does not support the given transfer mode 

822 (including the case where ingest is not supported at all). 

823 FileNotFoundError 

824 Raised if one of the given files does not exist. 

825 """ 

826 if transfer not in (None, "direct", "split") + self.root.transferModes: 826 ↛ 827 (the condition on line 826 was never true)

827 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

828 

829 # A relative URI indicates relative to datastore root 

830 srcUri = ResourcePath(path, forceAbsolute=False) 

831 if not srcUri.isabs(): 

832 srcUri = self.root.join(path) 

833 

834 if not srcUri.exists(): 

835 raise FileNotFoundError( 

836 f"Resource at {srcUri} does not exist; note that paths to ingest " 

837 f"are assumed to be relative to {self.root} unless they are absolute." 

838 ) 

839 

840 if transfer is None: 

841 relpath = srcUri.relative_to(self.root) 

842 if not relpath: 

843 raise RuntimeError( 

844 f"Transfer mode is None but source file ({srcUri}) is not within datastore ({self.root})" 

845 ) 

846 

847 # Return the relative path within the datastore for internal 

848 # transfer 

849 path = relpath 

850 

851 return path 

852 

853 def _extractIngestInfo( 

854 self, 

855 path: ResourcePathExpression, 

856 ref: DatasetRef, 

857 *, 

858 formatter: Union[Formatter, Type[Formatter]], 

859 transfer: Optional[str] = None, 

860 record_validation_info: bool = True, 

861 ) -> StoredFileInfo: 

862 """Relocate (if necessary) and extract `StoredFileInfo` from a 

863 to-be-ingested file. 

864 

865 Parameters 

866 ---------- 

867 path : `lsst.resources.ResourcePathExpression` 

868 URI or path of a file to be ingested. 

869 ref : `DatasetRef` 

870 Reference for the dataset being ingested. Guaranteed to have 

871 ``dataset_id is not None``. 

872 formatter : `type` or `Formatter` 

873 `Formatter` subclass to use for this dataset or an instance. 

874 transfer : `str`, optional 

875 How (and whether) the dataset should be added to the datastore. 

876 See `ingest` for details of transfer modes. 

877 record_validation_info : `bool`, optional 

878 If `True`, the default, the datastore can record validation 

879 information associated with the file. If `False` the datastore 

880 will not attempt to track any information such as checksums 

881 or file sizes. This can be useful if such information is tracked 

882 in an external system or if the file is to be compressed in place. 

883 It is up to the datastore whether this parameter is relevant. 

884 

885 Returns 

886 ------- 

887 info : `StoredFileInfo` 

888 Internal datastore record for this file. This will be inserted by 

889 the caller; `_extractIngestInfo` is only responsible for 

890 creating and populating the struct. 

891 

892 Raises 

893 ------ 

894 FileNotFoundError 

895 Raised if one of the given files does not exist. 

896 FileExistsError 

897 Raised if transfer is not `None` but the (internal) location the 

898 file would be moved to is already occupied. 

899 """ 

900 if self._transaction is None: 900 ↛ 901 (the condition on line 900 was never true)

901 raise RuntimeError("Ingest called without transaction enabled") 

902 

903 # Create URI of the source path, do not need to force a relative 

904 # path to absolute. 

905 srcUri = ResourcePath(path, forceAbsolute=False) 

906 

907 # Track whether we have read the size of the source yet 

908 have_sized = False 

909 

910 tgtLocation: Optional[Location] 

911 if transfer is None or transfer == "split": 

912 # A relative path is assumed to be relative to the datastore 

913 # in this context 

914 if not srcUri.isabs(): 

915 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

916 else: 

917 # Work out the path in the datastore from an absolute URI 

918 # This is required to be within the datastore. 

919 pathInStore = srcUri.relative_to(self.root) 

920 if pathInStore is None and transfer is None: 920 ↛ 921 (the condition on line 920 was never true)

921 raise RuntimeError( 

922 f"Unexpectedly learned that {srcUri} is not within datastore {self.root}" 

923 ) 

924 if pathInStore: 924 ↛ 926 (the condition on line 924 was never false)

925 tgtLocation = self.locationFactory.fromPath(pathInStore) 

926 elif transfer == "split": 

927 # Outside the datastore but treat that as a direct ingest 

928 # instead. 

929 tgtLocation = None 

930 else: 

931 raise RuntimeError(f"Unexpected transfer mode encountered: {transfer} for URI {srcUri}") 

932 elif transfer == "direct": 932 ↛ 937 (the condition on line 932 was never true)

933 # Want to store the full URI to the resource directly in 

934 # datastore. This is useful for referring to permanent archive 

935 # storage for raw data. 

936 # Trust that people know what they are doing. 

937 tgtLocation = None 

938 else: 

939 # Work out the name we want this ingested file to have 

940 # inside the datastore 

941 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

942 if not tgtLocation.uri.dirname().exists(): 

943 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

944 tgtLocation.uri.dirname().mkdir() 

945 

946 # if we are transferring from a local file to a remote location 

947 # it may be more efficient to get the size and checksum of the 

948 # local file rather than the transferred one 

949 if record_validation_info and srcUri.isLocal: 

950 size = srcUri.size() 

951 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

952 have_sized = True 

953 

954 # Transfer the resource to the destination. 

955 # Allow overwrite of an existing file. This matches the behavior 

956 # of datastore.put() in that it trusts that registry would not 

957 # be asking to overwrite unless registry thought that the 

958 # overwrite was allowed. 

959 tgtLocation.uri.transfer_from( 

960 srcUri, transfer=transfer, transaction=self._transaction, overwrite=True 

961 ) 

962 

963 if tgtLocation is None: 963 ↛ 965 (the condition on line 963 was never true)

964 # This means we are using direct mode 

965 targetUri = srcUri 

966 targetPath = str(srcUri) 

967 else: 

968 targetUri = tgtLocation.uri 

969 targetPath = tgtLocation.pathInStore.path 

970 

971 # the file should exist in the datastore now 

972 if record_validation_info: 

973 if not have_sized: 

974 size = targetUri.size() 

975 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

976 else: 

977 # Not recording any file information. 

978 size = -1 

979 checksum = None 

980 

981 return StoredFileInfo( 

982 formatter=formatter, 

983 path=targetPath, 

984 storageClass=ref.datasetType.storageClass, 

985 component=ref.datasetType.component(), 

986 file_size=size, 

987 checksum=checksum, 

988 ) 

989 
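
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# computeChecksum() is called from _extractIngestInfo() above but its body
# falls outside this excerpt.  A typical chunked implementation using the
# hashlib import at the top of the module would look roughly like this; the
# algorithm name and block size are assumptions, not the datastore's actual
# defaults.
import hashlib


def _compute_checksum(filename: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    """Return the hex digest of a local file, reading it in blocks."""
    if algorithm not in hashlib.algorithms_guaranteed:
        raise NameError(f"The specified algorithm '{algorithm}' is not supported by hashlib")
    hasher = hashlib.new(algorithm)
    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
# -----------------------------------------------------------------------------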

990 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

991 # Docstring inherited from Datastore._prepIngest. 

992 filtered = [] 

993 for dataset in datasets: 

994 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

995 if not acceptable: 

996 continue 

997 else: 

998 dataset.refs = acceptable 

999 if dataset.formatter is None: 

1000 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

1001 else: 

1002 assert isinstance(dataset.formatter, (type, str)) 

1003 formatter_class = get_class_of(dataset.formatter) 

1004 if not issubclass(formatter_class, Formatter): 1004 ↛ 1005 (the condition on line 1004 was never true)

1005 raise TypeError(f"Requested formatter {dataset.formatter} is not a Formatter class.") 

1006 dataset.formatter = formatter_class 

1007 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

1008 filtered.append(dataset) 

1009 return _IngestPrepData(filtered) 

1010 

1011 @transactional 

1012 def _finishIngest( 

1013 self, 

1014 prepData: Datastore.IngestPrepData, 

1015 *, 

1016 transfer: Optional[str] = None, 

1017 record_validation_info: bool = True, 

1018 ) -> None: 

1019 # Docstring inherited from Datastore._finishIngest. 

1020 refsAndInfos = [] 

1021 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

1022 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

1023 # Do ingest as if the first dataset ref is associated with the file 

1024 info = self._extractIngestInfo( 

1025 dataset.path, 

1026 dataset.refs[0], 

1027 formatter=dataset.formatter, 

1028 transfer=transfer, 

1029 record_validation_info=record_validation_info, 

1030 ) 

1031 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

1032 self._register_datasets(refsAndInfos) 

1033 

1034 def _calculate_ingested_datastore_name( 

1035 self, srcUri: ResourcePath, ref: DatasetRef, formatter: Union[Formatter, Type[Formatter]] 

1036 ) -> Location: 

1037 """Given a source URI and a DatasetRef, determine the name the 

1038 dataset will have inside datastore. 

1039 

1040 Parameters 

1041 ---------- 

1042 srcUri : `lsst.resources.ResourcePath` 

1043 URI to the source dataset file. 

1044 ref : `DatasetRef` 

1045 Ref associated with the newly-ingested dataset artifact. This 

1046 is used to determine the name within the datastore. 

1047 formatter : `Formatter` or `Formatter` class. 

1048 Formatter to use for validation. Can be a class or an instance. 

1049 

1050 Returns 

1051 ------- 

1052 location : `Location` 

1053 Target location for the newly-ingested dataset. 

1054 """ 

1055 # Ingesting a file from outside the datastore. 

1056 # This involves a new name. 

1057 template = self.templates.getTemplate(ref) 

1058 location = self.locationFactory.fromPath(template.format(ref)) 

1059 

1060 # Get the extension 

1061 ext = srcUri.getExtension() 

1062 

1063 # Update the destination to include that extension 

1064 location.updateExtension(ext) 

1065 

1066 # Ask the formatter to validate this extension 

1067 formatter.validateExtension(location) 

1068 

1069 return location 

1070 
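
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# The naming step in _calculate_ingested_datastore_name() above is: render the
# file template for the ref, then carry the source file's extension over to
# the destination.  Stdlib sketch of that pattern; the template fields in the
# usage note are invented for the example.
import posixpath


def _ingested_name(template: str, fields: dict, src_name: str) -> str:
    rendered = template.format(**fields)
    _, ext = posixpath.splitext(src_name)
    # Drop any placeholder extension from the template and use the source's,
    # mirroring Location.updateExtension().
    base, _ = posixpath.splitext(rendered)
    return base + ext


# e.g. _ingested_name("{run}/{datasetType}/{dataId}",
#                     {"run": "u/demo", "datasetType": "raw", "dataId": "exp_42"},
#                     "/archive/exp_42.fits") -> "u/demo/raw/exp_42.fits"
# -----------------------------------------------------------------------------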

1071 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

1072 """Write out in memory dataset to datastore. 

1073 

1074 Parameters 

1075 ---------- 

1076 inMemoryDataset : `object` 

1077 Dataset to write to datastore. 

1078 ref : `DatasetRef` 

1079 Registry information associated with this dataset. 

1080 

1081 Returns 

1082 ------- 

1083 info : `StoredFileInfo` 

1084 Information describing the artifact written to the datastore. 

1085 """ 

1086 # May need to coerce the in memory dataset to the correct 

1087 # python type. 

1088 inMemoryDataset = ref.datasetType.storageClass.coerce_type(inMemoryDataset) 

1089 

1090 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

1091 uri = location.uri 

1092 

1093 if not uri.dirname().exists(): 

1094 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

1095 uri.dirname().mkdir() 

1096 

1097 if self._transaction is None: 1097 ↛ 1098 (the condition on line 1097 was never true)

1098 raise RuntimeError("Attempting to write artifact without transaction enabled") 

1099 

1100 def _removeFileExists(uri: ResourcePath) -> None: 

1101 """Remove a file and do not complain if it is not there. 

1102 

1103 This is important since a formatter might fail before the file 

1104 is written and we should not confuse people by writing spurious 

1105 error messages to the log. 

1106 """ 

1107 try: 

1108 uri.remove() 

1109 except FileNotFoundError: 

1110 pass 

1111 

1112 # Register a callback to try to delete the uploaded data if 

1113 # something fails below 

1114 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

1115 

1116 # For a local file, simply use the formatter directly 

1117 if uri.isLocal: 

1118 try: 

1119 formatter.write(inMemoryDataset) 

1120 except Exception as e: 

1121 raise RuntimeError( 

1122 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to location {uri}" 

1123 ) from e 

1124 log.debug("Successfully wrote python object to local file at %s", uri) 

1125 else: 

1126 # This is a remote URI. Some datasets can be serialized directly 

1127 # to bytes and sent to the remote datastore without writing a 

1128 # file. If the dataset is intended to be saved to the cache 

1129 # a file is always written and direct write to the remote 

1130 # datastore is bypassed. 

1131 data_written = False 

1132 if not self.cacheManager.should_be_cached(ref): 

1133 try: 

1134 serializedDataset = formatter.toBytes(inMemoryDataset) 

1135 except NotImplementedError: 

1136 # Fallback to the file writing option. 

1137 pass 

1138 except Exception as e: 

1139 raise RuntimeError( 

1140 f"Failed to serialize dataset {ref} of type {type(inMemoryDataset)} to bytes." 

1141 ) from e 

1142 else: 

1143 log.debug("Writing bytes directly to %s", uri) 

1144 uri.write(serializedDataset, overwrite=True) 

1145 log.debug("Successfully wrote bytes directly to %s", uri) 

1146 data_written = True 

1147 

1148 if not data_written: 

1149 # Did not write the bytes directly to object store so instead 

1150 # write to temporary file. 

1151 with ResourcePath.temporary_uri(suffix=uri.getExtension()) as temporary_uri: 

1152 # Need to configure the formatter to write to a different 

1153 # location and that needs us to overwrite internals 

1154 log.debug("Writing dataset to temporary location at %s", temporary_uri) 

1155 with formatter._updateLocation(Location(None, temporary_uri)): 

1156 try: 

1157 formatter.write(inMemoryDataset) 

1158 except Exception as e: 

1159 raise RuntimeError( 

1160 f"Failed to serialize dataset {ref} of type" 

1161 f" {type(inMemoryDataset)} to " 

1162 f"temporary location {temporary_uri}" 

1163 ) from e 

1164 uri.transfer_from(temporary_uri, transfer="copy", overwrite=True) 

1165 

1166 # Cache if required 

1167 self.cacheManager.move_to_cache(temporary_uri, ref) 

1168 

1169 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1170 

1171 # The URI is needed to resolve which ingest case we are dealing with 

1172 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1173 
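
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# The remote-write branch of _write_in_memory_to_artifact() above follows a
# common pattern: serialize to a temporary local file, copy it to the
# destination, and register an undo action so a failed transaction can remove
# the artifact.  Stdlib-only sketch of that shape; the callback list is a
# stand-in for the butler transaction object and shutil.copyfile stands in for
# transfer_from().
import os
import shutil
import tempfile
from typing import Callable, List


def _write_via_temporary(payload: bytes, destination: str, undo_actions: List[Callable[[], None]]) -> None:
    # Arrange for the destination to be cleaned up if the caller rolls back.
    def _remove_if_exists() -> None:
        try:
            os.remove(destination)
        except FileNotFoundError:
            pass  # nothing was written; do not complain

    undo_actions.append(_remove_if_exists)

    suffix = os.path.splitext(destination)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(payload)
        tmp_name = tmp.name
    try:
        shutil.copyfile(tmp_name, destination)
    finally:
        os.remove(tmp_name)
# -----------------------------------------------------------------------------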

1174 def _read_artifact_into_memory( 

1175 self, 

1176 getInfo: DatastoreFileGetInformation, 

1177 ref: DatasetRef, 

1178 isComponent: bool = False, 

1179 cache_ref: Optional[DatasetRef] = None, 

1180 ) -> Any: 

1181 """Read the artifact from datastore into in memory object. 

1182 

1183 Parameters 

1184 ---------- 

1185 getInfo : `DatastoreFileGetInformation` 

1186 Information about the artifact within the datastore. 

1187 ref : `DatasetRef` 

1188 The registry information associated with this artifact. 

1189 isComponent : `bool` 

1190 Flag to indicate if a component is being read from this artifact. 

1191 cache_ref : `DatasetRef`, optional 

1192 The DatasetRef to use when looking up the file in the cache. 

1193 This ref must have the same ID as the supplied ref but can 

1194 be a parent ref or component ref to indicate to the cache whether 

1195 a composite file is being requested from the cache or a component 

1196 file. Without this the cache will default to the supplied ref but 

1197 it can get confused with read-only derived components for 

1198 disassembled composites. 

1199 

1200 Returns 

1201 ------- 

1202 inMemoryDataset : `object` 

1203 The artifact as a python object. 

1204 """ 

1205 location = getInfo.location 

1206 uri = location.uri 

1207 log.debug("Accessing data from %s", uri) 

1208 

1209 if cache_ref is None: 

1210 cache_ref = ref 

1211 if cache_ref.id != ref.id: 1211 ↛ 1212 (the condition on line 1211 was never true)

1212 raise ValueError( 

1213 "The supplied cache dataset ref refers to a different dataset than expected:" 

1214 f" {ref.id} != {cache_ref.id}" 

1215 ) 

1216 

1217 # Cannot recalculate checksum but can compare size as a quick check 

1218 # Do not do this if the size is negative since that indicates 

1219 # we do not know. 

1220 recorded_size = getInfo.info.file_size 

1221 resource_size = uri.size() 

1222 if recorded_size >= 0 and resource_size != recorded_size: 1222 ↛ 1223 (the condition on line 1222 was never true)

1223 raise RuntimeError( 

1224 "Integrity failure in Datastore. " 

1225 f"Size of file {uri} ({resource_size}) " 

1226 f"does not match size recorded in registry of {recorded_size}" 

1227 ) 

1228 

1229 # For the general case we have choices for how to proceed. 

1230 # 1. Always use a local file (downloading the remote resource to a 

1231 # temporary file if needed). 

1232 # 2. Use a threshold size and read into memory and use bytes. 

1233 # Use both for now with an arbitrary hand off size. 

1234 # This allows small datasets to be downloaded from remote object 

1235 # stores without requiring a temporary file. 

1236 

1237 formatter = getInfo.formatter 

1238 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1239 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1240 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1241 if cached_file is not None: 

1242 desired_uri = cached_file 

1243 msg = f" (cached version of {uri})" 

1244 else: 

1245 desired_uri = uri 

1246 msg = "" 

1247 with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): 

1248 serializedDataset = desired_uri.read() 

1249 log.debug( 

1250 "Deserializing %s from %d bytes from location %s with formatter %s", 

1251 f"component {getInfo.component}" if isComponent else "", 

1252 len(serializedDataset), 

1253 uri, 

1254 formatter.name(), 

1255 ) 

1256 try: 

1257 result = formatter.fromBytes( 

1258 serializedDataset, component=getInfo.component if isComponent else None 

1259 ) 

1260 except Exception as e: 

1261 raise ValueError( 

1262 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1263 f" ({ref.datasetType.name} from {uri}): {e}" 

1264 ) from e 

1265 else: 

1266 # Read from file. 

1267 

1268 # Have to update the Location associated with the formatter 

1269 # because formatter.read does not allow an override. 

1270 # This could be improved. 

1271 location_updated = False 

1272 msg = "" 

1273 

1274 # First check in cache for local version. 

1275 # The cache will only be relevant for remote resources but 

1276 # no harm in always asking. Context manager ensures that cache 

1277 # file is not deleted during cache expiration. 

1278 with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: 

1279 if cached_file is not None: 

1280 msg = f"(via cache read of remote file {uri})" 

1281 uri = cached_file 

1282 location_updated = True 

1283 

1284 with uri.as_local() as local_uri: 

1285 

1286 can_be_cached = False 

1287 if uri != local_uri: 1287 ↛ 1289 (the condition on line 1287 was never true)

1288 # URI was remote and file was downloaded 

1289 cache_msg = "" 

1290 location_updated = True 

1291 

1292 if self.cacheManager.should_be_cached(cache_ref): 

1293 # In this scenario we want to ask if the downloaded 

1294 # file should be cached but we should not cache 

1295 # it until after we've used it (to ensure it can't 

1296 # be expired whilst we are using it). 

1297 can_be_cached = True 

1298 

1299 # Say that it is "likely" to be cached because 

1300 # if the formatter read fails we will not be 

1301 # caching this file. 

1302 cache_msg = " and likely cached" 

1303 

1304 msg = f"(via download to local file{cache_msg})" 

1305 

1306 # Calculate the (possibly) new location for the formatter 

1307 # to use. 

1308 newLocation = Location(*local_uri.split()) if location_updated else None 

1309 

1310 log.debug( 

1311 "Reading%s from location %s %s with formatter %s", 

1312 f" component {getInfo.component}" if isComponent else "", 

1313 uri, 

1314 msg, 

1315 formatter.name(), 

1316 ) 

1317 try: 

1318 with formatter._updateLocation(newLocation): 

1319 with time_this( 

1320 log, 

1321 msg="Reading%s from location %s %s with formatter %s", 

1322 args=( 

1323 f" component {getInfo.component}" if isComponent else "", 

1324 uri, 

1325 msg, 

1326 formatter.name(), 

1327 ), 

1328 ): 

1329 result = formatter.read(component=getInfo.component if isComponent else None) 

1330 except Exception as e: 

1331 raise ValueError( 

1332 f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1333 f" ({ref.datasetType.name} from {uri}): {e}" 

1334 ) from e 

1335 

1336 # File was read successfully so can move to cache 

1337 if can_be_cached: 1337 ↛ 1338 (the condition on line 1337 was never true)

1338 self.cacheManager.move_to_cache(local_uri, cache_ref) 

1339 

1340 return self._post_process_get( 

1341 result, getInfo.readStorageClass, getInfo.assemblerParams, isComponent=isComponent 

1342 ) 

1343 
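
# --- Illustrative sketch (editor's addition, not part of fileDatastore.py) ---
# The read path in _read_artifact_into_memory() above uses a simple size
# threshold: small artifacts whose formatter can deserialize bytes are read
# straight into memory, larger ones go through a local (possibly cached) file.
# The dispatch itself, stripped of butler types, is just:
NBYTES_MAX = 10_000_000  # the hand-off size used above; tunable


def _choose_read_strategy(resource_size: int, can_read_bytes: bool) -> str:
    if resource_size <= NBYTES_MAX and can_read_bytes:
        return "bytes"       # download into memory, then formatter.fromBytes()
    return "local_file"      # stage to a local/cached file, then formatter.read()
# -----------------------------------------------------------------------------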

1344 def knows(self, ref: DatasetRef) -> bool: 

1345 """Check if the dataset is known to the datastore. 

1346 

1347 Does not check for existence of any artifact. 

1348 

1349 Parameters 

1350 ---------- 

1351 ref : `DatasetRef` 

1352 Reference to the required dataset. 

1353 

1354 Returns 

1355 ------- 

1356 exists : `bool` 

1357 `True` if the dataset is known to the datastore. 

1358 """ 

1359 fileLocations = self._get_dataset_locations_info(ref) 

1360 if fileLocations: 

1361 return True 

1362 return False 

1363 

1364 def _process_mexists_records( 

1365 self, 

1366 id_to_ref: Dict[DatasetId, DatasetRef], 

1367 records: Dict[DatasetId, List[StoredFileInfo]], 

1368 all_required: bool, 

1369 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

1370 ) -> Dict[DatasetRef, bool]: 

1371 """Helper function for mexists that checks the given records. 

1372 

1373 Parameters 

1374 ---------- 

1375 id_to_ref : `dict` of [`DatasetId`, `DatasetRef`] 

1376 Mapping of the dataset ID to the dataset ref itself. 

1377 records : `dict` of [`DatasetId`, `list` of `StoredFileInfo`] 

1378 Records as generally returned by 

1379 ``_get_stored_records_associated_with_refs``. 

1380 all_required : `bool` 

1381 Flag to indicate whether all artifacts associated with a dataset 

1382 ID must exist for the dataset itself to be considered to exist. 

1383 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1384 Optional mapping of datastore artifact to existence. Updated by 

1385 this method with details of all artifacts tested. Can be `None` 

1386 if the caller is not interested. 

1387 

1388 Returns 

1389 ------- 

1390 existence : `dict` of [`DatasetRef`, `bool`] 

1391 Mapping from dataset to boolean indicating existence. 

1392 """ 

1393 # The URIs to be checked and a mapping of those URIs to 

1394 # the dataset ID. 

1395 uris_to_check: List[ResourcePath] = [] 

1396 location_map: Dict[ResourcePath, DatasetId] = {} 

1397 

1398 location_factory = self.locationFactory 

1399 

1400 uri_existence: Dict[ResourcePath, bool] = {} 

1401 for ref_id, infos in records.items(): 

1402 # Key is the dataset ID, value is a list of StoredFileInfo. 

1403 uris = [info.file_location(location_factory).uri for info in infos] 

1404 location_map.update({uri: ref_id for uri in uris}) 

1405 

1406 # Check the local cache directly for a dataset corresponding 

1407 # to the remote URI. 

1408 if self.cacheManager.file_count > 0: 

1409 ref = id_to_ref[ref_id] 

1410 for uri, storedFileInfo in zip(uris, infos): 

1411 check_ref = ref 

1412 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 1412 ↛ 1413 (the condition on line 1412 was never true)

1413 check_ref = ref.makeComponentRef(component) 

1414 if self.cacheManager.known_to_cache(check_ref, uri.getExtension()): 

1415 # Proxy for URI existence. 

1416 uri_existence[uri] = True 

1417 else: 

1418 uris_to_check.append(uri) 

1419 else: 

1420 # Check all of them. 

1421 uris_to_check.extend(uris) 

1422 

1423 if artifact_existence is not None: 

1424 # If a URI has already been checked remove it from the list 

1425 # and immediately add the status to the output dict. 

1426 filtered_uris_to_check = [] 

1427 for uri in uris_to_check: 

1428 if uri in artifact_existence: 

1429 uri_existence[uri] = artifact_existence[uri] 

1430 else: 

1431 filtered_uris_to_check.append(uri) 

1432 uris_to_check = filtered_uris_to_check 

1433 

1434 # Results. 

1435 dataset_existence: Dict[DatasetRef, bool] = {} 

1436 

1437 uri_existence.update(ResourcePath.mexists(uris_to_check)) 

1438 for uri, exists in uri_existence.items(): 

1439 dataset_id = location_map[uri] 

1440 ref = id_to_ref[dataset_id] 

1441 

1442 # Disassembled composite needs to check all locations. 

1443 # all_required indicates whether all need to exist or not. 

1444 if ref in dataset_existence: 

1445 if all_required: 

1446 exists = dataset_existence[ref] and exists 

1447 else: 

1448 exists = dataset_existence[ref] or exists 

1449 dataset_existence[ref] = exists 

1450 

1451 if artifact_existence is not None: 

1452 artifact_existence.update(uri_existence) 

1453 

1454 return dataset_existence 

1455 

1456 def mexists( 

1457 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1458 ) -> Dict[DatasetRef, bool]: 

1459 """Check the existence of multiple datasets at once. 

1460 

1461 Parameters 

1462 ---------- 

1463 refs : iterable of `DatasetRef` 

1464 The datasets to be checked. 

1465 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

1466 Optional mapping of datastore artifact to existence. Updated by 

1467 this method with details of all artifacts tested. Can be `None` 

1468 if the caller is not interested. 

1469 

1470 Returns 

1471 ------- 

1472 existence : `dict` of [`DatasetRef`, `bool`] 

1473 Mapping from dataset to boolean indicating existence. 

1474 

1475 Notes 

1476 ----- 

1477 To minimize potentially costly remote existence checks, the local 

1478 cache is checked as a proxy for existence. If a file for this 

1479 `DatasetRef` does exist, no check is done for the actual URI. This 

1480 could result in unexpected behavior if the dataset itself 

1481 has been removed from the datastore by another process whilst it is 

1482 still in the cache. 

1483 """ 

1484 chunk_size = 10_000 

1485 dataset_existence: Dict[DatasetRef, bool] = {} 

1486 log.debug("Checking for the existence of multiple artifacts in datastore in chunks of %d", chunk_size) 

1487 n_found_total = 0 

1488 n_checked = 0 

1489 n_chunks = 0 

1490 for chunk in chunk_iterable(refs, chunk_size=chunk_size): 

1491 chunk_result = self._mexists(chunk, artifact_existence) 

1492 if log.isEnabledFor(VERBOSE): 

1493 n_results = len(chunk_result) 

1494 n_checked += n_results 

1495 # Can treat the booleans as 0, 1 integers and sum them. 

1496 n_found = sum(chunk_result.values()) 

1497 n_found_total += n_found 

1498 log.verbose( 

1499 "Number of datasets found in datastore for chunk %d = %d/%d (running total: %d/%d)", 

1500 n_chunks, 

1501 n_found, 

1502 n_results, 

1503 n_found_total, 

1504 n_checked, 

1505 ) 

1506 dataset_existence.update(chunk_result) 

1507 n_chunks += 1 

1508 

1509 return dataset_existence 

1510 
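A hedged sketch of the chunked bookkeeping used by ``mexists()``: a local chunking helper stands in for ``lsst.utils.iteration.chunk_iterable`` and a stub callable stands in for the per-chunk ``_mexists()`` call, so the running found/checked totals can be seen in isolation.

from itertools import islice
from typing import Callable, Dict, Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def chunks(items: Iterable[T], chunk_size: int) -> Iterator[List[T]]:
    """Yield successive lists of at most chunk_size items."""
    iterator = iter(items)
    while chunk := list(islice(iterator, chunk_size)):
        yield chunk

def check_in_chunks(
    refs: Iterable[str],
    check: Callable[[List[str]], Dict[str, bool]],
    chunk_size: int = 10_000,
) -> Dict[str, bool]:
    """Accumulate per-chunk results and keep running found/checked totals."""
    existence: Dict[str, bool] = {}
    n_found_total = n_checked = 0
    for n_chunk, chunk in enumerate(chunks(refs, chunk_size)):
        result = check(chunk)
        n_checked += len(result)
        n_found_total += sum(result.values())  # booleans sum as 0/1
        print(f"chunk {n_chunk}: running total {n_found_total}/{n_checked}")
        existence.update(result)
    return existence

# Stub check: pretend only even-numbered datasets exist.
refs = [f"dataset-{i}" for i in range(25)]
found = check_in_chunks(refs, lambda c: {r: int(r.rsplit("-", 1)[1]) % 2 == 0 for r in c}, chunk_size=10)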

1511 def _mexists( 

1512 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

1513 ) -> Dict[DatasetRef, bool]: 

1514 """Check the existence of multiple datasets at once. 

1515 

1516 Parameters 

1517 ---------- 

1518 refs : iterable of `DatasetRef` 

1519 The datasets to be checked. 

1520 

1521 Returns 

1522 ------- 

1523 existence : `dict` of [`DatasetRef`, `bool`] 

1524 Mapping from dataset to boolean indicating existence. 

1525 """ 

1526 # Need a mapping of dataset_id to dataset ref since the API 

1527 # works with dataset_id 

1528 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

1529 

1530 # Set of all IDs we are checking for. 

1531 requested_ids = set(id_to_ref.keys()) 

1532 

1533 # The records themselves. Could be missing some entries. 

1534 records = self._get_stored_records_associated_with_refs(refs) 

1535 

1536 dataset_existence = self._process_mexists_records( 

1537 id_to_ref, records, True, artifact_existence=artifact_existence 

1538 ) 

1539 

1540 # Set of IDs that have been handled. 

1541 handled_ids = {ref.id for ref in dataset_existence.keys()} 

1542 

1543 missing_ids = requested_ids - handled_ids 

1544 if missing_ids: 

1545 if not self.trustGetRequest: 

1546 # Must assume these do not exist 

1547 for missing in missing_ids: 

1548 dataset_existence[id_to_ref[missing]] = False 

1549 else: 

1550 log.debug( 

1551 "%d out of %d datasets were not known to datastore during initial existence check.", 

1552 len(missing_ids), 

1553 len(requested_ids), 

1554 ) 

1555 

1556 # Construct data structure identical to that returned 

1557 # by _get_stored_records_associated_with_refs() but using 

1558 # guessed names. 

1559 records = {} 

1560 for missing in missing_ids: 

1561 expected = self._get_expected_dataset_locations_info(id_to_ref[missing]) 

1562 records[missing] = [info for _, info in expected] 

1563 

1564 dataset_existence.update( 

1565 self._process_mexists_records( 

1566 id_to_ref, records, False, artifact_existence=artifact_existence 

1567 ) 

1568 ) 

1569 

1570 return dataset_existence 

1571 
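When ``trustGetRequest`` is enabled, dataset IDs missing from the record table are not immediately reported as absent; their expected locations are guessed and checked instead. A minimal sketch of that decision, with strings standing in for dataset IDs and a callable standing in for the guessed-location check:

from typing import Callable, Dict, Set

def existence_with_trust(
    requested: Set[str],
    known: Dict[str, bool],
    trust: bool,
    guessed_check: Callable[[str], bool],
) -> Dict[str, bool]:
    """Merge record-based existence with guessed-location checks for unknown IDs."""
    existence = dict(known)
    for dataset_id in requested - set(known):
        # Without trust an unknown dataset is assumed absent; with trust
        # the guessed artifact location is checked instead.
        existence[dataset_id] = guessed_check(dataset_id) if trust else False
    return existence

on_disk = {"c"}  # artifacts present but unknown to the record table
print(existence_with_trust({"a", "b", "c"}, {"a": True}, False, on_disk.__contains__))
print(existence_with_trust({"a", "b", "c"}, {"a": True}, True, on_disk.__contains__))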

1572 def exists(self, ref: DatasetRef) -> bool: 

1573 """Check if the dataset exists in the datastore. 

1574 

1575 Parameters 

1576 ---------- 

1577 ref : `DatasetRef` 

1578 Reference to the required dataset. 

1579 

1580 Returns 

1581 ------- 

1582 exists : `bool` 

1583 `True` if the entity exists in the `Datastore`. 

1584 

1585 Notes 

1586 ----- 

1587 The local cache is checked as a proxy for existence in the remote 

1588 object store. It is possible that another process on a different 

1589 compute node could remove the file from the object store even 

1590 though it is present in the local cache. 

1591 """ 

1592 fileLocations = self._get_dataset_locations_info(ref) 

1593 

1594 # if we are being asked to trust that registry might not be correct 

1595 # we ask for the expected locations and check them explicitly 

1596 if not fileLocations: 

1597 if not self.trustGetRequest: 

1598 return False 

1599 

1600 # First check the cache. If it is not found we must check 

1601 # the datastore itself. Assume that any component in the cache 

1602 # means that the dataset does exist somewhere. 

1603 if self.cacheManager.known_to_cache(ref): 1603 ↛ 1604line 1603 didn't jump to line 1604, because the condition on line 1603 was never true

1604 return True 

1605 

1606 # When we are guessing a dataset location we can not check 

1607 # for the existence of every component since we can not 

1608 # know if every component was written. Instead we check 

1609 # for the existence of any of the expected locations. 

1610 for location, _ in self._get_expected_dataset_locations_info(ref): 1610 ↛ 1613line 1610 didn't jump to line 1613, because the loop on line 1610 didn't complete

1611 if self._artifact_exists(location): 1611 ↛ 1610line 1611 didn't jump to line 1610, because the condition on line 1611 was never false

1612 return True 

1613 return False 

1614 

1615 # All listed artifacts must exist. 

1616 for location, storedFileInfo in fileLocations: 

1617 # Checking in cache needs the component ref. 

1618 check_ref = ref 

1619 if not ref.datasetType.isComponent() and (component := storedFileInfo.component): 

1620 check_ref = ref.makeComponentRef(component) 

1621 if self.cacheManager.known_to_cache(check_ref, location.getExtension()): 

1622 continue 

1623 

1624 if not self._artifact_exists(location): 

1625 return False 

1626 

1627 return True 

1628 

1629 def getURIs( 

1630 self, ref: DatasetRef, predict: bool = False 

1631 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1632 """Return URIs associated with dataset. 

1633 

1634 Parameters 

1635 ---------- 

1636 ref : `DatasetRef` 

1637 Reference to the required dataset. 

1638 predict : `bool`, optional 

1639 If the datastore does not know about the dataset, should it 

1640 return a predicted URI or not? 

1641 

1642 Returns 

1643 ------- 

1644 primary : `lsst.resources.ResourcePath` 

1645 The URI to the primary artifact associated with this dataset. 

1646 If the dataset was disassembled within the datastore this 

1647 may be `None`. 

1648 components : `dict` 

1649 URIs to any components associated with the dataset artifact. 

1650 Can be empty if there are no components. 

1651 """ 

1652 

1653 primary: Optional[ResourcePath] = None 

1654 components: Dict[str, ResourcePath] = {} 

1655 

1656 # if this has never been written then we have to guess 

1657 if not self.exists(ref): 

1658 if not predict: 

1659 raise FileNotFoundError(f"Dataset {ref} not in this datastore") 

1660 

1661 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1662 

1663 if doDisassembly: 

1664 

1665 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1666 compRef = ref.makeComponentRef(component) 

1667 compLocation, _ = self._determine_put_formatter_location(compRef) 

1668 

1669 # Add a URI fragment to indicate this is a guess 

1670 components[component] = ResourcePath(compLocation.uri.geturl() + "#predicted") 

1671 

1672 else: 

1673 

1674 location, _ = self._determine_put_formatter_location(ref) 

1675 

1676 # Add a URI fragment to indicate this is a guess 

1677 primary = ResourcePath(location.uri.geturl() + "#predicted") 

1678 

1679 return primary, components 

1680 

1681 # If this is a ref that we have written we can get the path. 

1682 # Get file metadata and internal metadata 

1683 fileLocations = self._get_dataset_locations_info(ref) 

1684 

1685 guessing = False 

1686 if not fileLocations: 

1687 if not self.trustGetRequest: 1687 ↛ 1688line 1687 didn't jump to line 1688, because the condition on line 1687 was never true

1688 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1689 fileLocations = self._get_expected_dataset_locations_info(ref) 

1690 guessing = True 

1691 

1692 if len(fileLocations) == 1: 

1693 # No disassembly so this is the primary URI 

1694 uri = fileLocations[0][0].uri 

1695 if guessing and not uri.exists(): 1695 ↛ 1696line 1695 didn't jump to line 1696, because the condition on line 1695 was never true

1696 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1697 primary = uri 

1698 

1699 else: 

1700 for location, storedFileInfo in fileLocations: 

1701 if storedFileInfo.component is None: 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true

1702 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1703 uri = location.uri 

1704 if guessing and not uri.exists(): 1704 ↛ 1708line 1704 didn't jump to line 1708, because the condition on line 1704 was never true

1705 # If we are trusting then it is entirely possible for 

1706 # some components to be missing. In that case we skip 

1707 # to the next component. 

1708 if self.trustGetRequest: 

1709 continue 

1710 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1711 components[storedFileInfo.component] = uri 

1712 

1713 return primary, components 

1714 
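Predicted URIs are marked with a "#predicted" fragment rather than a separate return value. A generic way for a caller to detect that flag, shown here with ``urllib.parse`` on plain URI strings (assuming the string form of the returned URI preserves the fragment):

from urllib.parse import urlparse

def is_predicted(uri: str) -> bool:
    """Return True if a URI string carries the '#predicted' fragment."""
    return urlparse(uri).fragment == "predicted"

assert is_predicted("file:///repo/datastore/raw/exp_001.fits#predicted")
assert not is_predicted("file:///repo/datastore/raw/exp_001.fits")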

1715 def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: 

1716 """URI to the Dataset. 

1717 

1718 Parameters 

1719 ---------- 

1720 ref : `DatasetRef` 

1721 Reference to the required Dataset. 

1722 predict : `bool` 

1723 If `True`, allow URIs to be returned of datasets that have not 

1724 been written. 

1725 

1726 Returns 

1727 ------- 

1728 uri : `lsst.resources.ResourcePath` 

1729 URI pointing to the dataset within the datastore. If the 

1730 dataset does not exist in the datastore, and if ``predict`` is 

1731 `True`, the URI will be a prediction and will include a URI 

1732 fragment "#predicted". 

1733 If the datastore does not have entities that relate well 

1734 to the concept of a URI the returned URI will be 

1735 descriptive. The returned URI is not guaranteed to be obtainable. 

1736 

1737 Raises 

1738 ------ 

1739 FileNotFoundError 

1740 Raised if a URI has been requested for a dataset that does not 

1741 exist and guessing is not allowed. 

1742 RuntimeError 

1743 Raised if a request is made for a single URI but multiple URIs 

1744 are associated with this dataset. 

1745 

1746 Notes 

1747 ----- 

1748 When a predicted URI is requested an attempt will be made to form 

1749 a reasonable URI based on file templates and the expected formatter. 

1750 """ 

1751 primary, components = self.getURIs(ref, predict) 

1752 if primary is None or components: 1752 ↛ 1753line 1752 didn't jump to line 1753, because the condition on line 1752 was never true

1753 raise RuntimeError( 

1754 f"Dataset ({ref}) includes distinct URIs for components. Use Datastore.getURIs() instead." 

1755 ) 

1756 return primary 

1757 

1758 def retrieveArtifacts( 

1759 self, 

1760 refs: Iterable[DatasetRef], 

1761 destination: ResourcePath, 

1762 transfer: str = "auto", 

1763 preserve_path: bool = True, 

1764 overwrite: bool = False, 

1765 ) -> List[ResourcePath]: 

1766 """Retrieve the file artifacts associated with the supplied refs. 

1767 

1768 Parameters 

1769 ---------- 

1770 refs : iterable of `DatasetRef` 

1771 The datasets for which file artifacts are to be retrieved. 

1772 A single ref can result in multiple files. The refs must 

1773 be resolved. 

1774 destination : `lsst.resources.ResourcePath` 

1775 Location to write the file artifacts. 

1776 transfer : `str`, optional 

1777 Method to use to transfer the artifacts. Must be one of the options 

1778 supported by `lsst.resources.ResourcePath.transfer_from()`. 

1779 "move" is not allowed. 

1780 preserve_path : `bool`, optional 

1781 If `True` the full path of the file artifact within the datastore 

1782 is preserved. If `False` the final file component of the path 

1783 is used. 

1784 overwrite : `bool`, optional 

1785 If `True` allow transfers to overwrite existing files at the 

1786 destination. 

1787 

1788 Returns 

1789 ------- 

1790 targets : `list` of `lsst.resources.ResourcePath` 

1791 URIs of file artifacts in destination location. Order is not 

1792 preserved. 

1793 """ 

1794 if not destination.isdir(): 1794 ↛ 1795line 1794 didn't jump to line 1795, because the condition on line 1794 was never true

1795 raise ValueError(f"Destination location must refer to a directory. Given {destination}") 

1796 

1797 if transfer == "move": 

1798 raise ValueError("Can not move artifacts out of datastore. Use copy instead.") 

1799 

1800 # Source -> Destination 

1801 # This also helps filter out duplicate DatasetRef in the request 

1802 # that will map to the same underlying file transfer. 

1803 to_transfer: Dict[ResourcePath, ResourcePath] = {} 

1804 

1805 for ref in refs: 

1806 locations = self._get_dataset_locations_info(ref) 

1807 for location, _ in locations: 

1808 source_uri = location.uri 

1809 target_path: ResourcePathExpression 

1810 if preserve_path: 

1811 target_path = location.pathInStore 

1812 if target_path.isabs(): 1812 ↛ 1815line 1812 didn't jump to line 1815, because the condition on line 1812 was never true

1813 # This is an absolute path to an external file. 

1814 # Use the full path. 

1815 target_path = target_path.relativeToPathRoot 

1816 else: 

1817 target_path = source_uri.basename() 

1818 target_uri = destination.join(target_path) 

1819 to_transfer[source_uri] = target_uri 

1820 

1821 # In theory can now parallelize the transfer 

1822 log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) 

1823 for source_uri, target_uri in to_transfer.items(): 

1824 target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) 

1825 

1826 return list(to_transfer.values()) 

1827 
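A hedged sketch of the ``preserve_path`` bookkeeping in ``retrieveArtifacts()``: each source artifact maps to a destination path that keeps either its datastore-relative path or just its basename, and duplicate sources collapse to a single transfer. Plain ``posixpath`` strings stand in for `ResourcePath` objects.

import posixpath
from typing import Dict, Iterable

def plan_retrieval(
    sources: Iterable[str],
    destination: str,
    preserve_path: bool = True,
) -> Dict[str, str]:
    """Map each source path to a target under destination; duplicates collapse."""
    to_transfer: Dict[str, str] = {}
    for source in sources:
        if preserve_path:
            target_path = source  # keep the datastore-relative path
        else:
            target_path = posixpath.basename(source)  # final component only
        to_transfer[source] = posixpath.join(destination, target_path)
    return to_transfer

plan = plan_retrieval(
    ["raw/r/exp_001.fits", "calexp/r/exp_001.fits", "raw/r/exp_001.fits"],
    "/tmp/export",
)
print(plan)  # two entries: the repeated source maps to a single transfer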

1828 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1829 """Load an InMemoryDataset from the store. 

1830 

1831 Parameters 

1832 ---------- 

1833 ref : `DatasetRef` 

1834 Reference to the required Dataset. 

1835 parameters : `dict` 

1836 `StorageClass`-specific parameters that specify, for example, 

1837 a slice of the dataset to be loaded. 

1838 

1839 Returns 

1840 ------- 

1841 inMemoryDataset : `object` 

1842 Requested dataset or slice thereof as an InMemoryDataset. 

1843 

1844 Raises 

1845 ------ 

1846 FileNotFoundError 

1847 Requested dataset can not be retrieved. 

1848 TypeError 

1849 Return value from formatter has unexpected type. 

1850 ValueError 

1851 Formatter failed to process the dataset. 

1852 """ 

1853 allGetInfo = self._prepare_for_get(ref, parameters) 

1854 refComponent = ref.datasetType.component() 

1855 

1856 # Supplied storage class for the component being read 

1857 refStorageClass = ref.datasetType.storageClass 

1858 

1859 # Create mapping from component name to related info 

1860 allComponents = {i.component: i for i in allGetInfo} 

1861 

1862 # By definition the dataset is disassembled if we have more 

1863 # than one record for it. 

1864 isDisassembled = len(allGetInfo) > 1 

1865 

1866 # Look for the special case where we are disassembled but the 

1867 # component is a derived component that was not written during 

1868 # disassembly. For this scenario we need to check that the 

1869 # component requested is listed as a derived component for the 

1870 # composite storage class 

1871 isDisassembledReadOnlyComponent = False 

1872 if isDisassembled and refComponent: 

1873 # The composite storage class should be accessible through 

1874 # the component dataset type 

1875 compositeStorageClass = ref.datasetType.parentStorageClass 

1876 

1877 # In the unlikely scenario where the composite storage 

1878 # class is not known, we can only assume that this is a 

1879 # normal component. If that assumption is wrong then the 

1880 # branch below that reads a persisted component will fail 

1881 # so there is no need to complain here. 

1882 if compositeStorageClass is not None: 1882 ↛ 1885line 1882 didn't jump to line 1885, because the condition on line 1882 was never false

1883 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1884 

1885 if isDisassembled and not refComponent: 

1886 # This was a disassembled dataset spread over multiple files 

1887 # and we need to put them all back together again. 

1888 # Read into memory and then assemble 

1889 

1890 # Check that the supplied parameters are suitable for the type read 

1891 refStorageClass.validateParameters(parameters) 

1892 

1893 # We want to keep track of all the parameters that were not used 

1894 # by formatters. We assume that if any of the component formatters 

1895 # use a parameter that we do not need to apply it again in the 

1896 # assembler. 

1897 usedParams = set() 

1898 

1899 components: Dict[str, Any] = {} 

1900 for getInfo in allGetInfo: 

1901 # assemblerParams are parameters not understood by the 

1902 # associated formatter. 

1903 usedParams.update(set(getInfo.formatterParams)) 

1904 

1905 component = getInfo.component 

1906 

1907 if component is None: 1907 ↛ 1908line 1907 didn't jump to line 1908, because the condition on line 1907 was never true

1908 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1909 

1910 # We do not want the formatter to think it's reading 

1911 # a component though because it is really reading a 

1912 # standalone dataset -- always tell reader it is not a 

1913 # component. 

1914 components[component] = self._read_artifact_into_memory( 

1915 getInfo, ref.makeComponentRef(component), isComponent=False 

1916 ) 

1917 

1918 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1919 

1920 # Any unused parameters will have to be passed to the assembler 

1921 if parameters: 

1922 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1923 else: 

1924 unusedParams = {} 

1925 

1926 # Process parameters 

1927 return ref.datasetType.storageClass.delegate().handleParameters( 

1928 inMemoryDataset, parameters=unusedParams 

1929 ) 

1930 

1931 elif isDisassembledReadOnlyComponent: 

1932 

1933 compositeStorageClass = ref.datasetType.parentStorageClass 

1934 if compositeStorageClass is None: 1934 ↛ 1935line 1934 didn't jump to line 1935, because the condition on line 1934 was never true

1935 raise RuntimeError( 

1936 f"Unable to retrieve derived component '{refComponent}' since" 

1937 "no composite storage class is available." 

1938 ) 

1939 

1940 if refComponent is None: 1940 ↛ 1942line 1940 didn't jump to line 1942, because the condition on line 1940 was never true

1941 # Mainly for mypy 

1942 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1943 

1944 # Assume that every derived component can be calculated by 

1945 # forwarding the request to a single read/write component. 

1946 # Rather than guessing which rw component is the right one by 

1947 # scanning each for a derived component of the same name, 

1948 # we ask the storage class delegate directly which one is best to 

1949 # use. 

1950 compositeDelegate = compositeStorageClass.delegate() 

1951 forwardedComponent = compositeDelegate.selectResponsibleComponent( 

1952 refComponent, set(allComponents) 

1953 ) 

1954 

1955 # Select the relevant component 

1956 rwInfo = allComponents[forwardedComponent] 

1957 

1958 # For now assume that read parameters are validated against 

1959 # the real component and not the requested component 

1960 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1961 forwardedStorageClass.validateParameters(parameters) 

1962 

1963 # The reference to use for the caching must refer to the forwarded 

1964 # component and not the derived component. 

1965 cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) 

1966 

1967 # Unfortunately the FileDescriptor inside the formatter will have 

1968 # the wrong write storage class so we need to create a new one 

1969 # given the immutability constraint. 

1970 writeStorageClass = rwInfo.info.storageClass 

1971 

1972 # We may need to put some thought into parameters for read 

1973 # components but for now forward them on as is 

1974 readFormatter = type(rwInfo.formatter)( 

1975 FileDescriptor( 

1976 rwInfo.location, 

1977 readStorageClass=refStorageClass, 

1978 storageClass=writeStorageClass, 

1979 parameters=parameters, 

1980 ), 

1981 ref.dataId, 

1982 ) 

1983 

1984 # The assembler can not receive any parameter requests for a 

1985 # derived component at this time since the assembler will 

1986 # see the storage class of the derived component and those 

1987 # parameters will have to be handled by the formatter on the 

1988 # forwarded storage class. 

1989 assemblerParams: Dict[str, Any] = {} 

1990 

1991 # Need to create a new info that specifies the derived 

1992 # component and associated storage class 

1993 readInfo = DatastoreFileGetInformation( 

1994 rwInfo.location, 

1995 readFormatter, 

1996 rwInfo.info, 

1997 assemblerParams, 

1998 {}, 

1999 refComponent, 

2000 refStorageClass, 

2001 ) 

2002 

2003 return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) 

2004 

2005 else: 

2006 # Single file request or component from that composite file 

2007 for lookup in (refComponent, None): 2007 ↛ 2012line 2007 didn't jump to line 2012, because the loop on line 2007 didn't complete

2008 if lookup in allComponents: 2008 ↛ 2007line 2008 didn't jump to line 2007, because the condition on line 2008 was never false

2009 getInfo = allComponents[lookup] 

2010 break 

2011 else: 

2012 raise FileNotFoundError( 

2013 f"Component {refComponent} not found for ref {ref} in datastore {self.name}" 

2014 ) 

2015 

2016 # Do not need the component itself if already disassembled 

2017 if isDisassembled: 

2018 isComponent = False 

2019 else: 

2020 isComponent = getInfo.component is not None 

2021 

2022 # For a component read of a composite we want the cache to 

2023 # be looking at the composite ref itself. 

2024 cache_ref = ref.makeCompositeRef() if isComponent else ref 

2025 

2026 # For a disassembled component we can validate parameters against 

2027 # the component storage class directly 

2028 if isDisassembled: 

2029 refStorageClass.validateParameters(parameters) 

2030 else: 

2031 # For an assembled composite this could be a derived 

2032 # component derived from a real component. The validity 

2033 # of the parameters is not clear. For now validate against 

2034 # the composite storage class 

2035 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

2036 

2037 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) 

2038 
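One detail of the reassembly branch above is the parameter bookkeeping: any parameter consumed by a component formatter is not applied again by the assembler, so only the leftovers are forwarded. A minimal sketch with illustrative names (not the real delegate API):

from typing import Any, Dict, Iterable, Set

def unused_assembler_parameters(
    parameters: Dict[str, Any],
    formatter_params_per_component: Iterable[Set[str]],
) -> Dict[str, Any]:
    """Return the parameters that no component formatter handled."""
    used: Set[str] = set()
    for handled in formatter_params_per_component:
        used.update(handled)
    return {k: v for k, v in parameters.items() if k not in used}

leftover = unused_assembler_parameters(
    {"bbox": (0, 0, 10, 10), "checksum": True},
    [{"bbox"}, set()],  # only one component formatter understood "bbox"
)
print(leftover)  # {'checksum': True} is left for the assembler to handle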

2039 @transactional 

2040 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

2041 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

2042 

2043 Parameters 

2044 ---------- 

2045 inMemoryDataset : `object` 

2046 The dataset to store. 

2047 ref : `DatasetRef` 

2048 Reference to the associated Dataset. 

2049 

2050 Raises 

2051 ------ 

2052 TypeError 

2053 Supplied object and storage class are inconsistent. 

2054 DatasetTypeNotSupportedError 

2055 The associated `DatasetType` is not handled by this datastore. 

2056 

2057 Notes 

2058 ----- 

2059 If the datastore is configured to reject certain dataset types it 

2060 is possible that the put will fail and raise a 

2061 `DatasetTypeNotSupportedError`. The main use case for this is to 

2062 allow `ChainedDatastore` to put to multiple datastores without 

2063 requiring that every datastore accepts the dataset. 

2064 """ 

2065 

2066 doDisassembly = self.composites.shouldBeDisassembled(ref) 

2067 # doDisassembly = True 

2068 

2069 artifacts = [] 

2070 if doDisassembly: 

2071 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

2072 for component, componentInfo in components.items(): 

2073 # Don't recurse because we want to take advantage of 

2074 # bulk insert -- need a new DatasetRef that refers to the 

2075 # same dataset_id but has the component DatasetType. 

2076 # DatasetType does not refer to the types of components, 

2077 # so we construct one ourselves. 

2078 compRef = ref.makeComponentRef(component) 

2079 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

2080 artifacts.append((compRef, storedInfo)) 

2081 else: 

2082 # Write the entire thing out 

2083 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

2084 artifacts.append((ref, storedInfo)) 

2085 

2086 self._register_datasets(artifacts) 

2087 
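A hedged sketch of the disassembly branch of ``put()``: the composite is split into components, each component is written as its own artifact, and the resulting (ref, info) pairs are registered in one bulk call. The toy delegate and writer below are illustrative stand-ins, not the butler API.

from typing import Any, Callable, Dict, List, Tuple

def put_with_disassembly(
    dataset: Dict[str, Any],
    ref: str,
    should_disassemble: bool,
    write_artifact: Callable[[Any, str], str],
) -> List[Tuple[str, str]]:
    """Write one artifact, or one per component, and return (ref, info) pairs."""
    artifacts: List[Tuple[str, str]] = []
    if should_disassemble:
        # Toy "delegate": treat each top-level key as a component.
        for component, payload in dataset.items():
            comp_ref = f"{ref}.{component}"
            artifacts.append((comp_ref, write_artifact(payload, comp_ref)))
    else:
        artifacts.append((ref, write_artifact(dataset, ref)))
    return artifacts

records = put_with_disassembly(
    {"image": [[0, 1], [1, 0]], "mask": [[0, 0], [0, 0]]},
    "exposure-42",
    should_disassemble=True,
    write_artifact=lambda payload, r: f"stored {r} ({len(payload)} rows)",
)
print(records)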

2088 @transactional 

2089 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

2090 # At this point can safely remove these datasets from the cache 

2091 # to avoid confusion later on. If they are not trashed later 

2092 # the cache will simply be refilled. 

2093 self.cacheManager.remove_from_cache(ref) 

2094 

2095 # If we are in trust mode there will be nothing to move to 

2096 # the trash table and we will have to try to delete the file 

2097 # immediately. 

2098 if self.trustGetRequest: 

2099 # Try to keep the logic below for a single file trash. 

2100 if isinstance(ref, DatasetRef): 

2101 refs = {ref} 

2102 else: 

2103 # Will recreate ref at the end of this branch. 

2104 refs = set(ref) 

2105 

2106 # Determine which datasets are known to datastore directly. 

2107 id_to_ref = {ref.getCheckedId(): ref for ref in refs} 

2108 existing_ids = self._get_stored_records_associated_with_refs(refs) 

2109 existing_refs = {id_to_ref[ref_id] for ref_id in existing_ids} 

2110 

2111 missing = refs - existing_refs 

2112 if missing: 

2113 # Do an explicit existence check on these refs. 

2114 # We only care about the artifacts at this point and not 

2115 # the dataset existence. 

2116 artifact_existence: Dict[ResourcePath, bool] = {} 

2117 _ = self.mexists(missing, artifact_existence) 

2118 uris = [uri for uri, exists in artifact_existence.items() if exists] 

2119 

2120 # FUTURE UPGRADE: Implement a parallelized bulk remove. 

2121 log.debug("Removing %d artifacts from datastore that are unknown to datastore", len(uris)) 

2122 for uri in uris: 

2123 try: 

2124 uri.remove() 

2125 except Exception as e: 

2126 if ignore_errors: 

2127 log.debug("Artifact %s could not be removed: %s", uri, e) 

2128 continue 

2129 raise 

2130 

2131 # There is no point asking the code below to remove refs we 

2132 # know are missing so update it with the list of existing 

2133 # records. Try to retain one vs many logic. 

2134 if not existing_refs: 

2135 # Nothing more to do since none of the datasets were 

2136 # known to the datastore record table. 

2137 return 

2138 ref = list(existing_refs) 

2139 if len(ref) == 1: 

2140 ref = ref[0] 

2141 

2142 # Get file metadata and internal metadata 

2143 if not isinstance(ref, DatasetRef): 

2144 log.debug("Doing multi-dataset trash in datastore %s", self.name) 

2145 # Assumed to be an iterable of refs so bulk mode enabled. 

2146 try: 

2147 self.bridge.moveToTrash(ref) 

2148 except Exception as e: 

2149 if ignore_errors: 

2150 log.warning("Unexpected issue moving multiple datasets to trash: %s", e) 

2151 else: 

2152 raise 

2153 return 

2154 

2155 log.debug("Trashing dataset %s in datastore %s", ref, self.name) 

2156 

2157 fileLocations = self._get_dataset_locations_info(ref) 

2158 

2159 if not fileLocations: 

2160 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

2161 if ignore_errors: 

2162 log.warning(err_msg) 

2163 return 

2164 else: 

2165 raise FileNotFoundError(err_msg) 

2166 

2167 for location, storedFileInfo in fileLocations: 

2168 if not self._artifact_exists(location): 2168 ↛ 2169line 2168 didn't jump to line 2169

2169 err_msg = ( 

2170 f"Dataset is known to datastore {self.name} but " 

2171 f"associated artifact ({location.uri}) is missing" 

2172 ) 

2173 if ignore_errors: 

2174 log.warning(err_msg) 

2175 return 

2176 else: 

2177 raise FileNotFoundError(err_msg) 

2178 

2179 # Mark dataset as trashed 

2180 try: 

2181 self.bridge.moveToTrash([ref]) 

2182 except Exception as e: 

2183 if ignore_errors: 

2184 log.warning( 

2185 "Attempted to mark dataset (%s) to be trashed in datastore %s " 

2186 "but encountered an error: %s", 

2187 ref, 

2188 self.name, 

2189 e, 

2190 ) 

2191 pass 

2192 else: 

2193 raise 

2194 

2195 @transactional 

2196 def emptyTrash(self, ignore_errors: bool = True) -> None: 

2197 """Remove all datasets from the trash. 

2198 

2199 Parameters 

2200 ---------- 

2201 ignore_errors : `bool` 

2202 If `True` return without error even if something went wrong. 

2203 Problems could occur if another process is simultaneously trying 

2204 to delete. 

2205 """ 

2206 log.debug("Emptying trash in datastore %s", self.name) 

2207 

2208 # Context manager will empty trash iff we finish it without raising. 

2209 # It will also automatically delete the relevant rows from the 

2210 # trash table and the records table. 

2211 with self.bridge.emptyTrash( 

2212 self._table, record_class=StoredFileInfo, record_column="path" 

2213 ) as trash_data: 

2214 # Removing the artifacts themselves requires that the files are 

2215 # not also associated with refs that are not to be trashed. 

2216 # Therefore need to do a query with the file paths themselves 

2217 # and return all the refs associated with them. Can only delete 

2218 # a file if the refs to be trashed are the only refs associated 

2219 # with the file. 

2220 # This requires multiple copies of the trashed items 

2221 trashed, artifacts_to_keep = trash_data 

2222 

2223 if artifacts_to_keep is None: 

2224 # The bridge is not helping us so have to work it out 

2225 # ourselves. This is not going to be as efficient. 

2226 trashed = list(trashed) 

2227 

2228 # The instance check is for mypy since up to this point it 

2229 # does not know the type of info. 

2230 path_map = self._refs_associated_with_artifacts( 

2231 [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] 

2232 ) 

2233 

2234 for ref, info in trashed: 

2235 

2236 # Mypy needs to know this is not the base class 

2237 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2238 

2239 # Check for mypy 

2240 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2241 

2242 path_map[info.path].remove(ref.id) 

2243 if not path_map[info.path]: 2243 ↛ 2234line 2243 didn't jump to line 2234, because the condition on line 2243 was never false

2244 del path_map[info.path] 

2245 

2246 artifacts_to_keep = set(path_map) 

2247 

2248 for ref, info in trashed: 

2249 

2250 # Should not happen for this implementation but need 

2251 # to keep mypy happy. 

2252 assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." 

2253 

2254 # Mypy needs to know this is not the base class 

2255 assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" 

2256 

2257 # Check for mypy 

2258 assert ref.id is not None, f"Internal logic error in emptyTrash with ref {ref}/{info}" 

2259 

2260 if info.path in artifacts_to_keep: 

2261 # This is a multi-dataset artifact and we are not 

2262 # removing all associated refs. 

2263 continue 

2264 

2265 # Only trashed refs still known to datastore will be returned. 

2266 location = info.file_location(self.locationFactory) 

2267 

2268 # Point of no return for this artifact 

2269 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

2270 try: 

2271 self._delete_artifact(location) 

2272 except FileNotFoundError: 

2273 # If the file itself has been deleted there is nothing 

2274 # we can do about it. It is possible that trash has 

2275 # been run in parallel in another process or someone 

2276 # decided to delete the file. It is unlikely to come 

2277 # back and so we should still continue with the removal 

2278 # of the entry from the trash table. It is also possible 

2279 # we removed it in a previous iteration if it was 

2280 # a multi-dataset artifact. The delete artifact method 

2281 # will log a debug message in this scenario. 

2282 # Distinguishing file missing before trash started and 

2283 # file already removed previously as part of this trash 

2284 # is not worth the distinction with regards to potential 

2285 # memory cost. 

2286 pass 

2287 except Exception as e: 

2288 if ignore_errors: 

2289 # Use a debug message here even though it's not 

2290 # a good situation. In some cases this can be 

2291 # caused by a race between user A and user B 

2292 # and neither of them has permissions for the 

2293 # other's files. Butler does not know about users 

2294 # and trash has no idea what collections these 

2295 # files were in (without guessing from a path). 

2296 log.debug( 

2297 "Encountered error removing artifact %s from datastore %s: %s", 

2298 location.uri, 

2299 self.name, 

2300 e, 

2301 ) 

2302 else: 

2303 raise 

2304 
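The guard against deleting shared artifacts can be summarised on its own: an artifact path may only be removed once every ref associated with it is in the trash. A minimal sketch with plain strings for refs and paths:

from typing import Dict, Iterable, List, Set, Tuple

def artifacts_safe_to_delete(
    trashed: Iterable[Tuple[str, str]],
    refs_per_path: Dict[str, Set[str]],
) -> List[str]:
    """Return artifact paths whose every associated ref is being trashed.

    ``trashed`` holds (ref_id, artifact_path) pairs; ``refs_per_path`` lists
    every ref currently associated with each path.
    """
    remaining = {path: set(refs) for path, refs in refs_per_path.items()}
    for ref_id, path in trashed:
        remaining[path].discard(ref_id)
    return [path for path, refs in remaining.items() if not refs]

trash = [("ref1", "a.fits"), ("ref2", "shared.fits")]
associations = {"a.fits": {"ref1"}, "shared.fits": {"ref2", "ref3"}}
print(artifacts_safe_to_delete(trash, associations))  # only 'a.fits'; ref3 still uses shared.fits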

2305 @transactional 

2306 def transfer_from( 

2307 self, 

2308 source_datastore: Datastore, 

2309 refs: Iterable[DatasetRef], 

2310 local_refs: Optional[Iterable[DatasetRef]] = None, 

2311 transfer: str = "auto", 

2312 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

2313 ) -> None: 

2314 # Docstring inherited 

2315 if type(self) is not type(source_datastore): 

2316 raise TypeError( 

2317 f"Datastore mismatch between this datastore ({type(self)}) and the " 

2318 f"source datastore ({type(source_datastore)})." 

2319 ) 

2320 

2321 # Be explicit for mypy 

2322 if not isinstance(source_datastore, FileDatastore): 2322 ↛ 2323line 2322 didn't jump to line 2323, because the condition on line 2322 was never true

2323 raise TypeError( 

2324 "Can only transfer to a FileDatastore from another FileDatastore, not" 

2325 f" {type(source_datastore)}" 

2326 ) 

2327 

2328 # Stop early if "direct" transfer mode is requested. That would 

2329 # require that the URI inside the source datastore should be stored 

2330 # directly in the target datastore, which seems unlikely to be useful 

2331 # since at any moment the source datastore could delete the file. 

2332 if transfer in ("direct", "split"): 

2333 raise ValueError( 

2334 f"Can not transfer from a source datastore using {transfer} mode since" 

2335 " those files are controlled by the other datastore." 

2336 ) 

2337 

2338 # Empty existence lookup if none given. 

2339 if artifact_existence is None: 

2340 artifact_existence = {} 

2341 

2342 # We will go through the list multiple times so must convert 

2343 # generators to lists. 

2344 refs = list(refs) 

2345 

2346 if local_refs is None: 

2347 local_refs = refs 

2348 else: 

2349 local_refs = list(local_refs) 

2350 

2351 # In order to handle disassembled composites the code works 

2352 # at the records level since it can assume that internal APIs 

2353 # can be used. 

2354 # - If the record already exists in the destination this is assumed 

2355 # to be okay. 

2356 # - If there is no record but the source and destination URIs are 

2357 # identical no transfer is done but the record is added. 

2358 # - If the source record refers to an absolute URI currently assume 

2359 # that that URI should remain absolute and will be visible to the 

2360 # destination butler. May need to have a flag to indicate whether 

2361 # the dataset should be transferred. This will only happen if 

2362 # the detached Butler has had a local ingest. 

2363 

2364 # What we really want is all the records in the source datastore 

2365 # associated with these refs. Or derived ones if they don't exist 

2366 # in the source. 

2367 source_records = source_datastore._get_stored_records_associated_with_refs(refs) 

2368 

2369 # The source dataset_ids are the keys in these records 

2370 source_ids = set(source_records) 

2371 log.debug("Number of datastore records found in source: %d", len(source_ids)) 

2372 

2373 # The not None check is to appease mypy 

2374 requested_ids = set(ref.id for ref in refs if ref.id is not None) 

2375 missing_ids = requested_ids - source_ids 

2376 

2377 # Missing IDs can be okay if that datastore has allowed 

2378 # gets based on file existence. Should we transfer what we can 

2379 # or complain about it and warn? 

2380 if missing_ids and not source_datastore.trustGetRequest: 2380 ↛ 2381line 2380 didn't jump to line 2381, because the condition on line 2380 was never true

2381 raise ValueError( 

2382 f"Some datasets are missing from source datastore {source_datastore}: {missing_ids}" 

2383 ) 

2384 

2385 # Need to map these missing IDs to a DatasetRef so we can guess 

2386 # the details. 

2387 if missing_ids: 

2388 log.info( 

2389 "Number of expected datasets missing from source datastore records: %d out of %d", 

2390 len(missing_ids), 

2391 len(requested_ids), 

2392 ) 

2393 id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} 

2394 

2395 # This should be chunked in case we end up having to check 

2396 # the file store since we need some log output to show 

2397 # progress. 

2398 for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): 

2399 records = {} 

2400 for missing in missing_ids_chunk: 

2401 # Ask the source datastore where the missing artifacts 

2402 # should be. An execution butler might not know about the 

2403 # artifacts even if they are there. 

2404 expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) 

2405 records[missing] = [info for _, info in expected] 

2406 

2407 # Call the mexist helper method in case we have not already 

2408 # checked these artifacts such that artifact_existence is 

2409 # empty. This allows us to benefit from parallelism. 

2410 # datastore.mexists() itself does not give us access to the 

2411 # derived datastore record. 

2412 log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) 

2413 ref_exists = source_datastore._process_mexists_records( 

2414 id_to_ref, records, False, artifact_existence=artifact_existence 

2415 ) 

2416 

2417 # Now go through the records and propagate the ones that exist. 

2418 location_factory = source_datastore.locationFactory 

2419 for missing, record_list in records.items(): 

2420 # Skip completely if the ref does not exist. 

2421 ref = id_to_ref[missing] 

2422 if not ref_exists[ref]: 

2423 log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) 

2424 continue 

2425 # Check for file artifact to decide which parts of a 

2426 # disassembled composite do exist. If there is only a 

2427 # single record we don't even need to look because it can't 

2428 # be a composite and must exist. 

2429 if len(record_list) == 1: 

2430 dataset_records = record_list 

2431 else: 

2432 dataset_records = [ 

2433 record 

2434 for record in record_list 

2435 if artifact_existence[record.file_location(location_factory).uri] 

2436 ] 

2437 assert len(dataset_records) > 0, "Disassembled composite should have had some files." 

2438 

2439 # Rely on source_records being a defaultdict. 

2440 source_records[missing].extend(dataset_records) 

2441 

2442 # See if we already have these records 

2443 target_records = self._get_stored_records_associated_with_refs(local_refs) 

2444 

2445 # The artifacts to register 

2446 artifacts = [] 

2447 

2448 # Refs that already exist 

2449 already_present = [] 

2450 

2451 # Now can transfer the artifacts 

2452 for source_ref, target_ref in zip(refs, local_refs): 

2453 if target_ref.id in target_records: 

2454 # Already have an artifact for this. 

2455 already_present.append(target_ref) 

2456 continue 

2457 

2458 # mypy needs to know these are always resolved refs 

2459 for info in source_records[source_ref.getCheckedId()]: 

2460 source_location = info.file_location(source_datastore.locationFactory) 

2461 target_location = info.file_location(self.locationFactory) 

2462 if source_location == target_location: 2462 ↛ 2466line 2462 didn't jump to line 2466, because the condition on line 2462 was never true

2463 # Either the dataset is already in the target datastore 

2464 # (which is how execution butler currently runs) or 

2465 # it is an absolute URI. 

2466 if source_location.pathInStore.isabs(): 

2467 # Just because we can see the artifact when running 

2468 # the transfer doesn't mean it will be generally 

2469 # accessible to a user of this butler. For now warn 

2470 # but assume it will be accessible. 

2471 log.warning( 

2472 "Transfer request for an outside-datastore artifact has been found at %s", 

2473 source_location, 

2474 ) 

2475 else: 

2476 # Need to transfer it to the new location. 

2477 # Assume we should always overwrite. If the artifact 

2478 # is there this might indicate that a previous transfer 

2479 # was interrupted but was not able to be rolled back 

2480 # completely (eg pre-emption) so follow Datastore default 

2481 # and overwrite. 

2482 target_location.uri.transfer_from( 

2483 source_location.uri, transfer=transfer, overwrite=True, transaction=self._transaction 

2484 ) 

2485 

2486 artifacts.append((target_ref, info)) 

2487 

2488 self._register_datasets(artifacts) 

2489 

2490 if already_present: 

2491 n_skipped = len(already_present) 

2492 log.info( 

2493 "Skipped transfer of %d dataset%s already present in datastore", 

2494 n_skipped, 

2495 "" if n_skipped == 1 else "s", 

2496 ) 

2497 
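A hedged sketch of the per-dataset decisions made during ``transfer_from()``: datasets with existing target records are skipped, and identical source and target locations need no copy. Strings stand in for datastore locations and no actual file transfer is performed.

from typing import Dict, List, Set, Tuple

def plan_transfer(
    source_paths: Dict[str, List[str]],
    source_root: str,
    target_root: str,
    already_present: Set[str],
) -> Tuple[List[Tuple[str, str]], List[str]]:
    """Return (copies to perform, dataset IDs skipped because already present).

    ``source_paths`` maps dataset ID to its artifact paths in the source
    datastore; ``already_present`` holds IDs with existing target records.
    """
    copies: List[Tuple[str, str]] = []
    skipped: List[str] = []
    for dataset_id, paths in source_paths.items():
        if dataset_id in already_present:
            skipped.append(dataset_id)
            continue
        for path in paths:
            source = f"{source_root}/{path}"
            target = f"{target_root}/{path}"
            if source != target:  # identical locations need no copy
                copies.append((source, target))
    return copies, skipped

copies, skipped = plan_transfer(
    {"d1": ["raw/a.fits"], "d2": ["calexp/b.fits"]},
    source_root="/repo1/datastore",
    target_root="/repo2/datastore",
    already_present={"d2"},
)
print(copies)   # [('/repo1/datastore/raw/a.fits', '/repo2/datastore/raw/a.fits')]
print(skipped)  # ['d2']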

2498 @transactional 

2499 def forget(self, refs: Iterable[DatasetRef]) -> None: 

2500 # Docstring inherited. 

2501 refs = list(refs) 

2502 self.bridge.forget(refs) 

2503 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

2504 

2505 def validateConfiguration( 

2506 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

2507 ) -> None: 

2508 """Validate some of the configuration for this datastore. 

2509 

2510 Parameters 

2511 ---------- 

2512 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

2513 Entities to test against this configuration. Can be differing 

2514 types. 

2515 logFailures : `bool`, optional 

2516 If `True`, output a log message for every validation error 

2517 detected. 

2518 

2519 Raises 

2520 ------ 

2521 DatastoreValidationError 

2522 Raised if there is a validation problem with a configuration. 

2523 All the problems are reported in a single exception. 

2524 

2525 Notes 

2526 ----- 

2527 This method checks that all the supplied entities have valid file 

2528 templates and also have formatters defined. 

2529 """ 

2530 

2531 templateFailed = None 

2532 try: 

2533 self.templates.validateTemplates(entities, logFailures=logFailures) 

2534 except FileTemplateValidationError as e: 

2535 templateFailed = str(e) 

2536 

2537 formatterFailed = [] 

2538 for entity in entities: 

2539 try: 

2540 self.formatterFactory.getFormatterClass(entity) 

2541 except KeyError as e: 

2542 formatterFailed.append(str(e)) 

2543 if logFailures: 2543 ↛ 2538line 2543 didn't jump to line 2538, because the condition on line 2543 was never false

2544 log.critical("Formatter failure: %s", e) 

2545 

2546 if templateFailed or formatterFailed: 

2547 messages = [] 

2548 if templateFailed: 2548 ↛ 2549line 2548 didn't jump to line 2549, because the condition on line 2548 was never true

2549 messages.append(templateFailed) 

2550 if formatterFailed: 2550 ↛ 2552line 2550 didn't jump to line 2552, because the condition on line 2550 was never false

2551 messages.append(",".join(formatterFailed)) 

2552 msg = ";\n".join(messages) 

2553 raise DatastoreValidationError(msg) 

2554 
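The collect-then-raise pattern used by ``validateConfiguration()`` can be sketched generically: run every check on every entity, accumulate the failure messages, and raise a single exception at the end rather than stopping at the first problem. Names below are illustrative only.

from typing import Callable, Iterable, List

def validate_all(entities: Iterable[str], checks: Iterable[Callable[[str], None]]) -> None:
    """Run every check on every entity and raise once with all failures."""
    checks = list(checks)
    failures: List[str] = []
    for entity in entities:
        for check in checks:
            try:
                check(entity)
            except (KeyError, ValueError) as e:
                failures.append(f"{entity}: {e}")
    if failures:
        raise ValueError(";\n".join(failures))

def has_template(entity: str) -> None:
    if not entity.startswith("known_"):
        raise KeyError(f"no file template for {entity!r}")

try:
    validate_all(["known_calexp", "mystery_type"], [has_template])
except ValueError as e:
    print(e)  # single exception listing every failure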

2555 def getLookupKeys(self) -> Set[LookupKey]: 

2556 # Docstring is inherited from base class 

2557 return ( 

2558 self.templates.getLookupKeys() 

2559 | self.formatterFactory.getLookupKeys() 

2560 | self.constraints.getLookupKeys() 

2561 ) 

2562 

2563 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

2564 # Docstring is inherited from base class 

2565 # The key can be valid in either formatters or templates so we can 

2566 # only check the template if it exists 

2567 if lookupKey in self.templates: 

2568 try: 

2569 self.templates[lookupKey].validateTemplate(entity) 

2570 except FileTemplateValidationError as e: 

2571 raise DatastoreValidationError(e) from e 

2572 

2573 def export( 

2574 self, 

2575 refs: Iterable[DatasetRef], 

2576 *, 

2577 directory: Optional[ResourcePathExpression] = None, 

2578 transfer: Optional[str] = "auto", 

2579 ) -> Iterable[FileDataset]: 

2580 # Docstring inherited from Datastore.export. 

2581 if transfer is not None and directory is None: 2581 ↛ 2582line 2581 didn't jump to line 2582, because the condition on line 2581 was never true

2582 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no export directory given") 

2583 

2584 # Force the directory to be a URI object 

2585 directoryUri: Optional[ResourcePath] = None 

2586 if directory is not None: 2586 ↛ 2589line 2586 didn't jump to line 2589, because the condition on line 2586 was never false

2587 directoryUri = ResourcePath(directory, forceDirectory=True) 

2588 

2589 if transfer is not None and directoryUri is not None: 2589 ↛ 2594line 2589 didn't jump to line 2594, because the condition on line 2589 was never false

2590 # mypy needs the second test 

2591 if not directoryUri.exists(): 2591 ↛ 2592line 2591 didn't jump to line 2592, because the condition on line 2591 was never true

2592 raise FileNotFoundError(f"Export location {directory} does not exist") 

2593 

2594 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

2595 for ref in progress.wrap(refs, "Exporting dataset files"): 

2596 fileLocations = self._get_dataset_locations_info(ref) 

2597 if not fileLocations: 2597 ↛ 2598line 2597 didn't jump to line 2598, because the condition on line 2597 was never true

2598 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

2599 # For now we can not export disassembled datasets 

2600 if len(fileLocations) > 1: 

2601 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

2602 location, storedFileInfo = fileLocations[0] 

2603 

2604 pathInStore = location.pathInStore.path 

2605 if transfer is None: 2605 ↛ 2609line 2605 didn't jump to line 2609, because the condition on line 2605 was never true

2606 # TODO: do we also need to return the readStorageClass somehow? 

2607 # We will use the path in store directly. If this is an 

2608 # absolute URI, preserve it. 

2609 if location.pathInStore.isabs(): 

2610 pathInStore = str(location.uri) 

2611 elif transfer == "direct": 2611 ↛ 2613line 2611 didn't jump to line 2613, because the condition on line 2611 was never true

2612 # Use full URIs to the remote store in the export 

2613 pathInStore = str(location.uri) 

2614 else: 

2615 # mypy needs help 

2616 assert directoryUri is not None, "directoryUri must be defined to get here" 

2617 storeUri = ResourcePath(location.uri) 

2618 

2619 # if the datastore has an absolute URI to a resource, we 

2620 # have two options: 

2621 # 1. Keep the absolute URI in the exported YAML 

2622 # 2. Allocate a new name in the local datastore and transfer 

2623 # it. 

2624 # For now go with option 2 

2625 if location.pathInStore.isabs(): 2625 ↛ 2626line 2625 didn't jump to line 2626, because the condition on line 2625 was never true

2626 template = self.templates.getTemplate(ref) 

2627 newURI = ResourcePath(template.format(ref), forceAbsolute=False) 

2628 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

2629 

2630 exportUri = directoryUri.join(pathInStore) 

2631 exportUri.transfer_from(storeUri, transfer=transfer) 

2632 

2633 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

2634 

2635 @staticmethod 

2636 def computeChecksum( 

2637 uri: ResourcePath, algorithm: str = "blake2b", block_size: int = 8192 

2638 ) -> Optional[str]: 

2639 """Compute the checksum of the supplied file. 

2640 

2641 Parameters 

2642 ---------- 

2643 uri : `lsst.resources.ResourcePath` 

2644 Name of resource to calculate checksum from. 

2645 algorithm : `str`, optional 

2646 Name of algorithm to use. Must be one of the algorithms supported 

2647 by :py:mod:`hashlib`. 

2648 block_size : `int` 

2649 Number of bytes to read from file at one time. 

2650 

2651 Returns 

2652 ------- 

2653 hexdigest : `str` 

2654 Hex digest of the file. 

2655 

2656 Notes 

2657 ----- 

2658 Currently returns None if the URI is for a remote resource. 

2659 """ 

2660 if algorithm not in hashlib.algorithms_guaranteed: 2660 ↛ 2661line 2660 didn't jump to line 2661, because the condition on line 2660 was never true

2661 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

2662 

2663 if not uri.isLocal: 2663 ↛ 2664line 2663 didn't jump to line 2664, because the condition on line 2663 was never true

2664 return None 

2665 

2666 hasher = hashlib.new(algorithm) 

2667 

2668 with uri.as_local() as local_uri: 

2669 with open(local_uri.ospath, "rb") as f: 

2670 for chunk in iter(lambda: f.read(block_size), b""): 

2671 hasher.update(chunk) 

2672 

2673 return hasher.hexdigest() 

2674 
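For reference, the same block-wise hashing pattern with the standard library alone, operating on an ordinary local path rather than a `ResourcePath`:

import hashlib

def file_checksum(path: str, algorithm: str = "blake2b", block_size: int = 8192) -> str:
    """Hex digest of a local file, read in block_size chunks."""
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(block_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()

# Example (assumes the file exists locally):
# print(file_checksum("/tmp/export/exp_001.fits"))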

2675 def needs_expanded_data_ids( 

2676 self, 

2677 transfer: Optional[str], 

2678 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

2679 ) -> bool: 

2680 # Docstring inherited. 

2681 # This _could_ also use entity to inspect whether the filename template 

2682 # involves placeholders other than the required dimensions for its 

2683 # dataset type, but that's not necessary for correctness; it just 

2684 # enables more optimizations (perhaps only in theory). 

2685 return transfer not in ("direct", None)