
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreConfig, 

60 DatastoreValidationError, 

61 FileDescriptor, 

62 FileTemplates, 

63 FileTemplateValidationError, 

64 Formatter, 

65 FormatterFactory, 

66 Location, 

67 LocationFactory, 

68 Progress, 

69 StorageClass, 

70 StoredFileInfo, 

71) 

72 

73from lsst.daf.butler import ddl 

74from lsst.daf.butler.registry.interfaces import ( 

75 ReadOnlyDatabaseError, 

76 DatastoreRegistryBridge, 

77) 

78 

79from lsst.daf.butler.core.repoRelocation import replaceRoot 

80from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

81from .genericDatastore import GenericBaseDatastore 

82 

83if TYPE_CHECKING:

84 from lsst.daf.butler import LookupKey 

85 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

86 

87log = logging.getLogger(__name__) 

88 

89# String to use when a Python None is encountered 
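# It is stored in the "component" column, which is part of the table's
# primary key and therefore cannot hold a real NULL.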

90NULLSTR = "__NULL_STRING__" 

91 

92 

93class _IngestPrepData(Datastore.IngestPrepData): 

94 """Helper class for FileDatastore ingest implementation. 

95 

96 Parameters 

97 ---------- 

98 datasets : `list` of `FileDataset` 

99 Files to be ingested by this datastore. 

100 """ 

101 def __init__(self, datasets: List[FileDataset]): 

102 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

103 self.datasets = datasets 

104 

105 

106@dataclass(frozen=True) 

107class DatastoreFileGetInformation: 

108 """Collection of useful parameters needed to retrieve a file from 

109 a Datastore. 

110 """ 

111 

112 location: Location 

113 """The location from which to read the dataset.""" 

114 

115 formatter: Formatter 

116 """The `Formatter` to use to deserialize the dataset.""" 

117 

118 info: StoredFileInfo 

119 """Stored information about this file and its formatter.""" 

120 

121 assemblerParams: Dict[str, Any] 

122 """Parameters to use for post-processing the retrieved dataset.""" 

123 

124 formatterParams: Dict[str, Any] 

125 """Parameters that were understood by the associated formatter.""" 

126 

127 component: Optional[str] 

128 """The component to be retrieved (can be `None`).""" 

129 

130 readStorageClass: StorageClass 

131 """The `StorageClass` of the dataset being read.""" 

132 

133 

134class FileDatastore(GenericBaseDatastore): 

135 """Generic Datastore for file-based implementations. 

136 

137 Should always be sub-classed since key abstract methods are missing. 

138 

139 Parameters 

140 ---------- 

141 config : `DatastoreConfig` or `str` 

142 Configuration as either a `Config` object or URI to file. 

143 bridgeManager : `DatastoreRegistryBridgeManager` 

144 Object that manages the interface between `Registry` and datastores. 

145 butlerRoot : `str`, optional 

146 New datastore root to use to override the configuration value. 

147 

148 Raises 

149 ------ 

150 ValueError 

151 If root location does not exist and ``create`` is `False` in the 

152 configuration. 

153 """ 

154 

155 defaultConfigFile: ClassVar[Optional[str]] = None 

156 """Path to configuration defaults. Accessed within the ``config`` resource 

157 or relative to a search path. Can be `None` if no defaults are specified.

158 """ 

159 

160 root: ButlerURI 

161 """Root directory URI of this `Datastore`.""" 

162 

163 locationFactory: LocationFactory 

164 """Factory for creating locations relative to the datastore root.""" 

165 

166 formatterFactory: FormatterFactory 

167 """Factory for creating instances of formatters.""" 

168 

169 templates: FileTemplates 

170 """File templates that can be used by this `Datastore`.""" 

171 

172 composites: CompositesMap 

173 """Determines whether a dataset should be disassembled on put.""" 

174 

175 defaultConfigFile = "datastores/fileDatastore.yaml" 

176 """Path to configuration defaults. Accessed within the ``config`` resource 

177 or relative to a search path. Can be `None` if no defaults are specified.

178 """ 

179 

180 @classmethod 

181 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

182 """Set any filesystem-dependent config options for this Datastore to 

183 be appropriate for a new empty repository with the given root. 

184 

185 Parameters 

186 ---------- 

187 root : `str` 

188 URI to the root of the data repository. 

189 config : `Config` 

190 A `Config` to update. Only the subset understood by 

191 this component will be updated. Will not expand 

192 defaults. 

193 full : `Config` 

194 A complete config with all defaults expanded that can be 

195 converted to a `DatastoreConfig`. Read-only and will not be 

196 modified by this method. 

197 Repository-specific options that should not be obtained 

198 from defaults when Butler instances are constructed 

199 should be copied from ``full`` to ``config``. 

200 overwrite : `bool`, optional 

201 If `False`, do not modify a value in ``config`` if the value 

202 already exists. Default is always to overwrite with the provided 

203 ``root``. 

204 

205 Notes 

206 ----- 

207 If a keyword is explicitly defined in the supplied ``config`` it 

208 will not be overridden by this method if ``overwrite`` is `False`. 

209 This allows explicit values set in external configs to be retained. 

210 """ 

211 Config.updateParameters(DatastoreConfig, config, full, 

212 toUpdate={"root": root}, 

213 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

214 

215 @classmethod 

216 def makeTableSpec(cls) -> ddl.TableSpec: 
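        """Make the specification of the opaque table used to record file
        metadata (path, formatter, storage class, checksum, size) for each
        dataset_id/component pairing stored in this datastore.
        """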

217 return ddl.TableSpec( 

218 fields=[ 

219 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

220 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

221 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

222 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

223 # Use a sentinel string to indicate no component

224 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

225 # TODO: should checksum be Base64Bytes instead? 

226 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

227 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

228 ], 

229 unique=frozenset(), 

230 ) 

231 

232 def __init__(self, config: Union[DatastoreConfig, str], 

233 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

234 super().__init__(config, bridgeManager) 

235 if "root" not in self.config: 235 ↛ 236line 235 didn't jump to line 236, because the condition on line 235 was never true

236 raise ValueError("No root directory specified in configuration") 

237 

238 # Name ourselves either using an explicit name or a name 

239 # derived from the (unexpanded) root 

240 if "name" in self.config: 

241 self.name = self.config["name"] 

242 else: 

243 # We use the unexpanded root in the name to indicate that this 

244 # datastore can be moved without having to update registry. 

245 self.name = "{}@{}".format(type(self).__name__, 

246 self.config["root"]) 

247 

248 # Support repository relocation in config 

249 # Existence of self.root is checked in subclass 

250 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

251 forceDirectory=True, forceAbsolute=True) 

252 

253 self.locationFactory = LocationFactory(self.root) 

254 self.formatterFactory = FormatterFactory() 

255 

256 # Now associate formatters with storage classes 

257 self.formatterFactory.registerFormatters(self.config["formatters"], 

258 universe=bridgeManager.universe) 

259 

260 # Read the file naming templates 

261 self.templates = FileTemplates(self.config["templates"], 

262 universe=bridgeManager.universe) 

263 

264 # See if composites should be disassembled 

265 self.composites = CompositesMap(self.config["composites"], 

266 universe=bridgeManager.universe) 

267 

268 tableName = self.config["records", "table"] 

269 try: 

270 # Storage of paths and formatters, keyed by dataset_id 

271 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

272 # Interface to Registry. 

273 self._bridge = bridgeManager.register(self.name) 

274 except ReadOnlyDatabaseError: 

275 # If the database is read only and we just tried and failed to 

276 # create a table, it means someone is trying to create a read-only 

277 # butler client for an empty repo. That should be okay, as long 

278 # as they then try to get any datasets before some other client 

279 # creates the table. Chances are they're just validating

280 # configuration. 

281 pass 

282 

283 # Determine whether checksums should be used - default to False 

284 self.useChecksum = self.config.get("checksum", False) 

285 

286 # Determine whether we can fall back to configuration if a 

287 # requested dataset is not known to registry 

288 self.trustGetRequest = self.config.get("trust_get_request", False) 

289 

290 # Check existence and create directory structure if necessary 

291 if not self.root.exists(): 

292 if "create" not in self.config or not self.config["create"]: 292 ↛ 293line 292 didn't jump to line 293, because the condition on line 292 was never true

293 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

294 try: 

295 self.root.mkdir() 

296 except Exception as e: 

297 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

298 f" Got error: {e}") from e 

299 

300 def __str__(self) -> str: 

301 return str(self.root) 

302 

303 @property 

304 def bridge(self) -> DatastoreRegistryBridge: 

305 return self._bridge 

306 

307 def _artifact_exists(self, location: Location) -> bool: 

308 """Check that an artifact exists in this datastore at the specified 

309 location. 

310 

311 Parameters 

312 ---------- 

313 location : `Location` 

314 Expected location of the artifact associated with this datastore. 

315 

316 Returns 

317 ------- 

318 exists : `bool` 

319 `True` if the location can be found, `False` otherwise.

320 """ 

321 log.debug("Checking if resource exists: %s", location.uri) 

322 return location.uri.exists() 

323 

324 def _delete_artifact(self, location: Location) -> None: 

325 """Delete the artifact from the datastore. 

326 

327 Parameters 

328 ---------- 

329 location : `Location` 

330 Location of the artifact associated with this datastore. 

331 """ 

332 if location.pathInStore.isabs():

333 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

334 log.debug("Deleting file: %s", location.uri) 

335 location.uri.remove() 

336 log.debug("Successfully deleted file: %s", location.uri) 

337 

338 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

339 # Docstring inherited from GenericBaseDatastore 

340 records = [] 

341 for ref, info in zip(refs, infos): 

342 # Component should come from ref and fall back on info 

343 component = ref.datasetType.component() 

344 if component is None and info.component is not None:

345 component = info.component 

346 if component is None: 

347 # Use the NULLSTR sentinel since we want this column to be

348 # part of the primary key (and so it cannot be NULL).

349 component = NULLSTR 

350 records.append( 

351 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

352 storage_class=info.storageClass.name, component=component, 

353 checksum=info.checksum, file_size=info.file_size) 

354 ) 
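        # Insert all the records for these refs in a single call.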

355 self._table.insert(*records) 

356 

357 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

358 # Docstring inherited from GenericBaseDatastore 

359 

360 # Look for the dataset_id -- there might be multiple matches 

361 # if we have disassembled the dataset. 

362 records = list(self._table.fetch(dataset_id=ref.id)) 

363 

364 results = [] 

365 for record in records: 

366 # Convert name of StorageClass to instance 

367 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

368 component = record["component"] if (record["component"] 

369 and record["component"] != NULLSTR) else None 

370 

371 info = StoredFileInfo(formatter=record["formatter"], 

372 path=record["path"], 

373 storageClass=storageClass, 

374 component=component, 

375 checksum=record["checksum"], 

376 file_size=record["file_size"]) 

377 results.append(info) 

378 

379 return results 

380 

381 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]: 

382 """Return all dataset refs associated with the supplied path. 

383 

384 Parameters 

385 ---------- 

386 pathInStore : `ButlerURI` 

387 Path of interest in the data store. 

388 

389 Returns 

390 ------- 

391 ids : `set` of `int` 

392 All `DatasetRef` IDs associated with this path. 

393 """ 

394 records = list(self._table.fetch(path=str(pathInStore))) 

395 ids = {r["dataset_id"] for r in records} 

396 return ids 

397 

398 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

399 # Docstring inherited from GenericBaseDatastore 

400 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

401 

402 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

403 r"""Find all the `Location`\ s of the requested dataset in the 

404 `Datastore` and the associated stored file information. 

405 

406 Parameters 

407 ---------- 

408 ref : `DatasetRef` 

409 Reference to the required `Dataset`. 

410 

411 Returns 

412 ------- 

413 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

414 Location of the dataset within the datastore and 

415 stored information about each file and its formatter. 

416 """ 

417 # Get the file information (this will fail if no file) 

418 records = self.getStoredItemsInfo(ref) 

419 

420 # Use the path to determine the location -- we need to take 

421 # into account absolute URIs in the datastore record 

422 locations: List[Tuple[Location, StoredFileInfo]] = [] 

423 for r in records: 

424 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

425 if uriInStore.isabs():

426 location = Location(None, uriInStore) 

427 else: 

428 location = self.locationFactory.fromPath(r.path) 

429 locations.append((location, r)) 

430 return locations 

431 

432 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

433 """Check that there is only one dataset associated with the 

434 specified artifact. 

435 

436 Parameters 

437 ---------- 

438 ref : `DatasetRef` or `FakeDatasetRef` 

439 Dataset to be removed. 

440 location : `Location` 

441 The location of the artifact to be removed. 

442 

443 Returns 

444 ------- 

445 can_remove : `bool`

446 True if the artifact can be safely removed. 

447 """ 

448 # Can't ever delete absolute URIs. 

449 if location.pathInStore.isabs():

450 return False 

451 

452 # Get all entries associated with this path 

453 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

454 if not allRefs:

455 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

456 

457 # Remove these refs from all the refs and if there is nothing left 

458 # then we can delete 

459 remainingRefs = allRefs - {ref.id} 

460 

461 if remainingRefs: 

462 return False 

463 return True 

464 

465 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

466 StoredFileInfo]]: 

467 """Predict the location and related file information of the requested 

468 dataset in this datastore. 

469 

470 Parameters 

471 ---------- 

472 ref : `DatasetRef` 

473 Reference to the required `Dataset`. 

474 

475 Returns 

476 ------- 

477 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

478 Expected Location of the dataset within the datastore and 

479 placeholder information about each file and its formatter. 

480 

481 Notes 

482 ----- 

483 Uses the current configuration to determine how we would expect the 

484 datastore files to have been written if we couldn't ask registry. 

485 This is safe so long as there has been no change to datastore 

486 configuration between writing the dataset and wanting to read it. 

487 Will not work for files that have been ingested without using the 

488 standard file template or default formatter. 

489 """ 

490 

491 # If we have a component ref we always need to ask the questions 

492 # of the composite. If the composite is disassembled this routine 

493 # should return all components. If the composite was not 

494 # disassembled the composite is what is stored regardless of 

495 # component request. Note that if the caller has disassembled 

496 # a composite there is no way for this guess to know that 

497 # without trying both the composite and component ref and seeing 

498 # if there is something at the component Location even without 

499 # disassembly being enabled. 

500 if ref.datasetType.isComponent(): 

501 ref = ref.makeCompositeRef() 

502 

503 # See if the ref is a composite that should be disassembled 

504 doDisassembly = self.composites.shouldBeDisassembled(ref) 

505 

506 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

507 

508 if doDisassembly: 

509 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

510 compRef = ref.makeComponentRef(component) 

511 location, formatter = self._determine_put_formatter_location(compRef) 

512 all_info.append((location, formatter, componentStorage, component)) 

513 

514 else: 

515 # Always use the composite ref if no disassembly 

516 location, formatter = self._determine_put_formatter_location(ref) 

517 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

518 

519 # Convert the list of tuples to have StoredFileInfo as second element 

520 return [(location, StoredFileInfo(formatter=formatter, 

521 path=location.pathInStore.path, 

522 storageClass=storageClass, 

523 component=component, 

524 checksum=None, 

525 file_size=-1)) 

526 for location, formatter, storageClass, component in all_info] 

527 

528 def _prepare_for_get(self, ref: DatasetRef, 

529 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

530 """Check parameters for ``get`` and obtain formatter and 

531 location. 

532 

533 Parameters 

534 ---------- 

535 ref : `DatasetRef` 

536 Reference to the required Dataset. 

537 parameters : `dict` 

538 `StorageClass`-specific parameters that specify, for example, 

539 a slice of the dataset to be loaded. 

540 

541 Returns 

542 ------- 

543 getInfo : `list` [`DatastoreFileGetInformation`] 

544 Parameters needed to retrieve each file. 

545 """ 

546 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

547 

548 # Get file metadata and internal metadata 

549 fileLocations = self._get_dataset_locations_info(ref) 

550 if not fileLocations: 

551 if not self.trustGetRequest: 

552 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

553 # Assume the dataset is where we think it should be 

554 fileLocations = self._get_expected_dataset_locations_info(ref) 

555 

556 # The storage class we want to use eventually 

557 refStorageClass = ref.datasetType.storageClass 

558 
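        # More than one file for this ref means the composite was disassembled
        # when it was written.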

559 if len(fileLocations) > 1: 

560 disassembled = True 

561 else: 

562 disassembled = False 

563 

564 # Is this a component request? 

565 refComponent = ref.datasetType.component() 

566 

567 fileGetInfo = [] 

568 for location, storedFileInfo in fileLocations: 

569 

570 # The storage class used to write the file 

571 writeStorageClass = storedFileInfo.storageClass 

572 

573 # If this has been disassembled we need read to match the write 

574 if disassembled: 

575 readStorageClass = writeStorageClass 

576 else: 

577 readStorageClass = refStorageClass 

578 

579 formatter = getInstanceOf(storedFileInfo.formatter, 

580 FileDescriptor(location, readStorageClass=readStorageClass, 

581 storageClass=writeStorageClass, parameters=parameters), 

582 ref.dataId) 

583 

584 formatterParams, notFormatterParams = formatter.segregateParameters() 

585 

586 # Of the remaining parameters, extract the ones supported by 

587 # this StorageClass (for components not all will be handled) 

588 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

589 

590 # The ref itself could be a component if the dataset was 

591 # disassembled by butler, or we disassembled in datastore and 

592 # components came from the datastore records 

593 component = storedFileInfo.component if storedFileInfo.component else refComponent 

594 

595 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

596 assemblerParams, formatterParams, 

597 component, readStorageClass)) 

598 

599 return fileGetInfo 

600 

601 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

602 """Check the arguments for ``put`` and obtain formatter and 

603 location. 

604 

605 Parameters 

606 ---------- 

607 inMemoryDataset : `object` 

608 The dataset to store. 

609 ref : `DatasetRef` 

610 Reference to the associated Dataset. 

611 

612 Returns 

613 ------- 

614 location : `Location` 

615 The location to write the dataset. 

616 formatter : `Formatter` 

617 The `Formatter` to use to write the dataset. 

618 

619 Raises 

620 ------ 

621 TypeError 

622 Supplied object and storage class are inconsistent. 

623 DatasetTypeNotSupportedError 

624 The associated `DatasetType` is not handled by this datastore. 

625 """ 

626 self._validate_put_parameters(inMemoryDataset, ref) 

627 return self._determine_put_formatter_location(ref) 

628 

629 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

630 """Calculate the formatter and output location to use for put. 

631 

632 Parameters 

633 ---------- 

634 ref : `DatasetRef` 

635 Reference to the associated Dataset. 

636 

637 Returns 

638 ------- 

639 location : `Location` 

640 The location to write the dataset. 

641 formatter : `Formatter` 

642 The `Formatter` to use to write the dataset. 

643 """ 

644 # Work out output file name 

645 try: 

646 template = self.templates.getTemplate(ref) 

647 except KeyError as e: 

648 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

649 

650 # Validate the template to protect against filenames from different 

651 # dataIds returning the same and causing overwrite confusion. 

652 template.validateTemplate(ref) 

653 

654 location = self.locationFactory.fromPath(template.format(ref)) 

655 

656 # Get the formatter based on the storage class 

657 storageClass = ref.datasetType.storageClass 

658 try: 

659 formatter = self.formatterFactory.getFormatter(ref, 

660 FileDescriptor(location, 

661 storageClass=storageClass), 

662 ref.dataId) 

663 except KeyError as e: 

664 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

665 f"{self.name}") from e 

666 

667 # Now that we know the formatter, update the location 

668 location = formatter.makeUpdatedLocation(location) 

669 

670 return location, formatter 

671 

672 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

673 # Docstring inherited from base class 

674 if transfer != "auto": 

675 return transfer 

676 

677 # See if the paths are within the datastore or not 

678 inside = [self._pathInStore(d.path) is not None for d in datasets] 

679 

680 if all(inside): 

681 transfer = None 

682 elif not any(inside):

683 # Allow ButlerURI to use its own knowledge 

684 transfer = "auto" 

685 else: 

686 raise ValueError("Some datasets are inside the datastore and some are outside." 

687 " Please use an explicit transfer mode and not 'auto'.") 

688 

689 return transfer 

690 

691 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

692 """Return path relative to datastore root 

693 

694 Parameters 

695 ---------- 

696 path : `str` or `ButlerURI` 

697 Path to the dataset. Can be an absolute URI. If relative, it is

698 assumed to be relative to the datastore root. Returns the path

699 within the datastore, or `None` if the path is outside it.

700 

701 Returns 

702 ------- 

703 inStore : `str` 

704 Path relative to datastore root. Returns `None` if the file is 

705 outside the root. 

706 """ 

707 # Relative path will always be relative to datastore 

708 pathUri = ButlerURI(path, forceAbsolute=False) 

709 return pathUri.relative_to(self.root) 

710 

711 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

712 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

713 """Standardize the path of a to-be-ingested file. 

714 

715 Parameters 

716 ---------- 

717 path : `str` or `ButlerURI` 

718 Path of a file to be ingested. 

719 transfer : `str`, optional 

720 How (and whether) the dataset should be added to the datastore. 

721 See `ingest` for details of transfer modes. 

722 This implementation is provided only so 

723 `NotImplementedError` can be raised if the mode is not supported; 

724 actual transfers are deferred to `_extractIngestInfo`. 

725 

726 Returns 

727 ------- 

728 path : `str` or `ButlerURI` 

729 New path in what the datastore considers standard form. If an 

730 absolute URI was given that will be returned unchanged. 

731 

732 Notes 

733 ----- 

734 Subclasses of `FileDatastore` can implement this method instead 

735 of `_prepIngest`. It should not modify the data repository or given 

736 file in any way. 

737 

738 Raises 

739 ------ 

740 NotImplementedError 

741 Raised if the datastore does not support the given transfer mode 

742 (including the case where ingest is not supported at all). 

743 FileNotFoundError 

744 Raised if one of the given files does not exist. 

745 """ 

746 if transfer not in (None, "direct") + self.root.transferModes: 746 ↛ 747line 746 didn't jump to line 747, because the condition on line 746 was never true

747 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

748 

749 # A relative URI indicates relative to datastore root 

750 srcUri = ButlerURI(path, forceAbsolute=False) 

751 if not srcUri.isabs(): 

752 srcUri = self.root.join(path) 

753 

754 if not srcUri.exists(): 

755 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

756 f"are assumed to be relative to {self.root} unless they are absolute.") 

757 

758 if transfer is None: 

759 relpath = srcUri.relative_to(self.root) 

760 if not relpath: 

761 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

762 f"within datastore ({self.root})") 

763 

764 # Return the relative path within the datastore for internal 

765 # transfer 

766 path = relpath 

767 

768 return path 

769 

770 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

771 formatter: Union[Formatter, Type[Formatter]], 

772 transfer: Optional[str] = None) -> StoredFileInfo: 

773 """Relocate (if necessary) and extract `StoredFileInfo` from a 

774 to-be-ingested file. 

775 

776 Parameters 

777 ---------- 

778 path : `str` or `ButlerURI` 

779 URI or path of a file to be ingested. 

780 ref : `DatasetRef` 

781 Reference for the dataset being ingested. Guaranteed to have 

782 ``dataset_id`` not `None`.

783 formatter : `type` or `Formatter` 

784 `Formatter` subclass to use for this dataset or an instance. 

785 transfer : `str`, optional 

786 How (and whether) the dataset should be added to the datastore. 

787 See `ingest` for details of transfer modes. 

788 

789 Returns 

790 ------- 

791 info : `StoredFileInfo` 

792 Internal datastore record for this file. This will be inserted by 

793 the caller; `_extractIngestInfo` is only responsible for

794 creating and populating the struct. 

795 

796 Raises 

797 ------ 

798 FileNotFoundError 

799 Raised if one of the given files does not exist. 

800 FileExistsError 

801 Raised if transfer is not `None` but the (internal) location the 

802 file would be moved to is already occupied. 

803 """ 

804 if self._transaction is None:

805 raise RuntimeError("Ingest called without transaction enabled") 

806 

807 # Create URI of the source path, do not need to force a relative 

808 # path to absolute. 

809 srcUri = ButlerURI(path, forceAbsolute=False) 

810 

811 # Track whether we have read the size of the source yet 

812 have_sized = False 

813 

814 tgtLocation: Optional[Location] 
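        # Three cases follow: transfer=None leaves the file where it is
        # (it must already be inside the datastore), "direct" records the
        # absolute source URI as-is, and any other mode transfers the file
        # to a location derived from the file template.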

815 if transfer is None: 

816 # A relative path is assumed to be relative to the datastore 

817 # in this context 

818 if not srcUri.isabs(): 

819 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

820 else: 

821 # Work out the path in the datastore from an absolute URI 

822 # This is required to be within the datastore. 

823 pathInStore = srcUri.relative_to(self.root) 

824 if pathInStore is None:

825 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

826 f"not within datastore {self.root}") 

827 tgtLocation = self.locationFactory.fromPath(pathInStore) 

828 elif transfer == "direct": 828 ↛ 833line 828 didn't jump to line 833, because the condition on line 828 was never true

829 # Want to store the full URI to the resource directly in 

830 # datastore. This is useful for referring to permanent archive 

831 # storage for raw data. 

832 # Trust that people know what they are doing. 

833 tgtLocation = None 

834 else: 

835 # Work out the name we want this ingested file to have 

836 # inside the datastore 

837 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

838 if not tgtLocation.uri.dirname().exists(): 

839 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

840 tgtLocation.uri.dirname().mkdir() 

841 

842 # if we are transferring from a local file to a remote location 

843 # it may be more efficient to get the size and checksum of the 

844 # local file rather than the transferred one 

845 if not srcUri.scheme or srcUri.scheme == "file": 845 ↛ 851line 845 didn't jump to line 851, because the condition on line 845 was never false

846 size = srcUri.size() 

847 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

848 have_sized = True 

849 

850 # transfer the resource to the destination 

851 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

852 

853 if tgtLocation is None:

854 # This means we are using direct mode 

855 targetUri = srcUri 

856 targetPath = str(srcUri) 

857 else: 

858 targetUri = tgtLocation.uri 

859 targetPath = tgtLocation.pathInStore.path 

860 

861 # the file should exist in the datastore now 

862 if not have_sized: 

863 size = targetUri.size() 

864 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

865 

866 return StoredFileInfo(formatter=formatter, path=targetPath, 

867 storageClass=ref.datasetType.storageClass, 

868 component=ref.datasetType.component(), 

869 file_size=size, checksum=checksum) 

870 

871 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

872 # Docstring inherited from Datastore._prepIngest. 

873 filtered = [] 
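        # Keep only the refs this datastore will accept and resolve the
        # formatter and standardized path for each dataset; no file is
        # actually transferred until _finishIngest runs.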

874 for dataset in datasets: 

875 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

876 if not acceptable: 

877 continue 

878 else: 

879 dataset.refs = acceptable 

880 if dataset.formatter is None: 

881 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

882 else: 

883 assert isinstance(dataset.formatter, (type, str)) 

884 dataset.formatter = getClassOf(dataset.formatter) 

885 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

886 filtered.append(dataset) 

887 return _IngestPrepData(filtered) 

888 

889 @transactional 

890 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

891 # Docstring inherited from Datastore._finishIngest. 

892 refsAndInfos = [] 

893 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

894 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

895 # Do ingest as if the first dataset ref is associated with the file 

896 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

897 transfer=transfer) 

898 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

899 self._register_datasets(refsAndInfos) 

900 

901 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

902 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

903 """Given a source URI and a DatasetRef, determine the name the 

904 dataset will have inside the datastore.

905 

906 Parameters 

907 ---------- 

908 srcUri : `ButlerURI` 

909 URI to the source dataset file. 

910 ref : `DatasetRef` 

911 Ref associated with the newly-ingested dataset artifact. This 

912 is used to determine the name within the datastore. 

913 formatter : `Formatter` or Formatter class. 

914 Formatter to use for validation. Can be a class or an instance. 

915 

916 Returns 

917 ------- 

918 location : `Location` 

919 Target location for the newly-ingested dataset. 

920 """ 

921 # Ingesting a file from outside the datastore. 

922 # This involves a new name. 

923 template = self.templates.getTemplate(ref) 

924 location = self.locationFactory.fromPath(template.format(ref)) 

925 

926 # Get the extension 

927 ext = srcUri.getExtension() 

928 

929 # Update the destination to include that extension 

930 location.updateExtension(ext) 

931 

932 # Ask the formatter to validate this extension 

933 formatter.validateExtension(location) 

934 

935 return location 

936 

937 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

938 """Write out in memory dataset to datastore. 

939 

940 Parameters 

941 ---------- 

942 inMemoryDataset : `object` 

943 Dataset to write to datastore. 

944 ref : `DatasetRef` 

945 Registry information associated with this dataset. 

946 

947 Returns 

948 ------- 

949 info : `StoredFileInfo` 

950 Information describing the artifact written to the datastore.

951 """ 

952 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

953 uri = location.uri 

954 

955 if not uri.dirname().exists(): 

956 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

957 uri.dirname().mkdir() 

958 

959 if self._transaction is None:

960 raise RuntimeError("Attempting to write artifact without transaction enabled") 

961 

962 def _removeFileExists(uri: ButlerURI) -> None: 

963 """Remove a file and do not complain if it is not there. 

964 

965 This is important since a formatter might fail before the file 

966 is written and we should not confuse people by writing spurious 

967 error messages to the log. 

968 """ 

969 try: 

970 uri.remove() 

971 except FileNotFoundError: 

972 pass 

973 

974 # Register a callback to try to delete the uploaded data if 

975 # something fails below 

976 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

977 

978 # For a local file, simply use the formatter directly 

979 if uri.isLocal: 

980 formatter.write(inMemoryDataset) 

981 log.debug("Successfully wrote python object to local file at %s", uri) 

982 else: 

983 # This is a remote URI, so first try bytes and write directly else 

984 # fallback to a temporary file 

985 try: 

986 serializedDataset = formatter.toBytes(inMemoryDataset) 

987 log.debug("Writing bytes directly to %s", uri) 

988 uri.write(serializedDataset, overwrite=True) 

989 log.debug("Successfully wrote bytes directly to %s", uri) 

990 except NotImplementedError: 

991 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

992 # Need to configure the formatter to write to a different 

993 # location and that needs us to overwrite internals 

994 tmpLocation = Location(*os.path.split(tmpFile.name)) 

995 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

996 with formatter._updateLocation(tmpLocation): 

997 formatter.write(inMemoryDataset) 

998 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

999 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1000 

1001 # URI is needed to resolve which ingest case we are dealing with

1002 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1003 

1004 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1005 ref: DatasetRef, isComponent: bool = False) -> Any: 

1006 """Read the artifact from datastore into in memory object. 

1007 

1008 Parameters 

1009 ---------- 

1010 getInfo : `DatastoreFileGetInformation` 

1011 Information about the artifact within the datastore. 

1012 ref : `DatasetRef` 

1013 The registry information associated with this artifact. 

1014 isComponent : `bool` 

1015 Flag to indicate if a component is being read from this artifact. 

1016 

1017 Returns 

1018 ------- 

1019 inMemoryDataset : `object` 

1020 The artifact as a python object. 

1021 """ 

1022 location = getInfo.location 

1023 uri = location.uri 

1024 log.debug("Accessing data from %s", uri) 

1025 

1026 # Cannot recalculate checksum but can compare size as a quick check 

1027 # Do not do this if the size is negative since that indicates 

1028 # we do not know. 

1029 recorded_size = getInfo.info.file_size 

1030 resource_size = uri.size() 

1031 if recorded_size >= 0 and resource_size != recorded_size:

1032 raise RuntimeError("Integrity failure in Datastore. " 

1033 f"Size of file {uri} ({resource_size}) " 

1034 f"does not match size recorded in registry of {recorded_size}") 

1035 

1036 # For the general case we have choices for how to proceed. 

1037 # 1. Always use a local file (downloading the remote resource to a 

1038 # temporary file if needed). 

1039 # 2. Use a threshold size and read into memory and use bytes. 

1040 # Use both for now with an arbitrary hand off size. 

1041 # This allows small datasets to be downloaded from remote object 

1042 # stores without requiring a temporary file. 

1043 

1044 formatter = getInfo.formatter 

1045 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1046 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1047 serializedDataset = uri.read() 

1048 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1049 f"component {getInfo.component}" if isComponent else "", 

1050 len(serializedDataset), uri, formatter.name()) 

1051 try: 

1052 result = formatter.fromBytes(serializedDataset, 

1053 component=getInfo.component if isComponent else None) 

1054 except Exception as e: 

1055 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1056 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1057 else: 

1058 # Read from file 

1059 with uri.as_local() as local_uri: 

1060 # Have to update the Location associated with the formatter 

1061 # because formatter.read does not allow an override. 

1062 # This could be improved. 

1063 msg = "" 

1064 newLocation = None 

1065 if uri != local_uri: 

1066 newLocation = Location(*local_uri.split()) 

1067 msg = "(via download to local file)" 

1068 

1069 log.debug("Reading %s from location %s %s with formatter %s", 

1070 f"component {getInfo.component}" if isComponent else "", 

1071 uri, msg, formatter.name()) 

1072 try: 

1073 with formatter._updateLocation(newLocation): 

1074 result = formatter.read(component=getInfo.component if isComponent else None) 

1075 except Exception as e: 

1076 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1077 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1078 

1079 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1080 isComponent=isComponent) 

1081 

1082 def exists(self, ref: DatasetRef) -> bool: 

1083 """Check if the dataset exists in the datastore. 

1084 

1085 Parameters 

1086 ---------- 

1087 ref : `DatasetRef` 

1088 Reference to the required dataset. 

1089 

1090 Returns 

1091 ------- 

1092 exists : `bool` 

1093 `True` if the entity exists in the `Datastore`. 

1094 """ 

1095 fileLocations = self._get_dataset_locations_info(ref) 

1096 

1097 # if we are being asked to trust that registry might not be correct 

1098 # we ask for the expected locations and check them explicitly 

1099 if not fileLocations: 

1100 if not self.trustGetRequest: 

1101 return False 

1102 fileLocations = self._get_expected_dataset_locations_info(ref) 

1103 for location, _ in fileLocations: 

1104 if not self._artifact_exists(location): 

1105 return False 

1106 

1107 return True 

1108 

1109 def getURIs(self, ref: DatasetRef, 

1110 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1111 """Return URIs associated with dataset. 

1112 

1113 Parameters 

1114 ---------- 

1115 ref : `DatasetRef` 

1116 Reference to the required dataset. 

1117 predict : `bool`, optional 

1118 If the datastore does not know about the dataset, should it 

1119 return a predicted URI or not? 

1120 

1121 Returns 

1122 ------- 

1123 primary : `ButlerURI` 

1124 The URI to the primary artifact associated with this dataset. 

1125 If the dataset was disassembled within the datastore this 

1126 may be `None`. 

1127 components : `dict` 

1128 URIs to any components associated with the dataset artifact. 

1129 Can be empty if there are no components. 

1130 """ 

1131 

1132 primary: Optional[ButlerURI] = None 

1133 components: Dict[str, ButlerURI] = {} 

1134 

1135 # if this has never been written then we have to guess 

1136 if not self.exists(ref): 

1137 if not predict: 

1138 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1139 

1140 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1141 

1142 if doDisassembly: 

1143 

1144 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1145 compRef = ref.makeComponentRef(component) 

1146 compLocation, _ = self._determine_put_formatter_location(compRef) 

1147 

1148 # Add a URI fragment to indicate this is a guess 

1149 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1150 

1151 else: 

1152 

1153 location, _ = self._determine_put_formatter_location(ref) 

1154 

1155 # Add a URI fragment to indicate this is a guess 

1156 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1157 

1158 return primary, components 

1159 

1160 # If this is a ref that we have written we can get the path. 

1161 # Get file metadata and internal metadata 

1162 fileLocations = self._get_dataset_locations_info(ref) 

1163 

1164 guessing = False 

1165 if not fileLocations: 

1166 if not self.trustGetRequest:

1167 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1168 fileLocations = self._get_expected_dataset_locations_info(ref) 

1169 guessing = True 

1170 

1171 if len(fileLocations) == 1: 

1172 # No disassembly so this is the primary URI 

1173 uri = fileLocations[0][0].uri 

1174 if guessing and not uri.exists():

1175 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1176 primary = uri 

1177 

1178 else: 

1179 for location, storedFileInfo in fileLocations: 

1180 if storedFileInfo.component is None:

1181 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1182 uri = location.uri 

1183 if guessing and not uri.exists():

1184 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1185 components[storedFileInfo.component] = uri 

1186 

1187 return primary, components 

1188 

1189 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1190 """URI to the Dataset. 

1191 

1192 Parameters 

1193 ---------- 

1194 ref : `DatasetRef` 

1195 Reference to the required Dataset. 

1196 predict : `bool` 

1197 If `True`, allow URIs to be returned of datasets that have not 

1198 been written. 

1199 

1200 Returns 

1201 ------- 

1202 uri : `ButlerURI`

1203 URI pointing to the dataset within the datastore. If the 

1204 dataset does not exist in the datastore, and if ``predict`` is 

1205 `True`, the URI will be a prediction and will include a URI 

1206 fragment "#predicted". 

1207 If the datastore does not have entities that relate well 

1208 to the concept of a URI the returned URI will be 

1209 descriptive. The returned URI is not guaranteed to be obtainable. 

1210 

1211 Raises 

1212 ------ 

1213 FileNotFoundError 

1214 Raised if a URI has been requested for a dataset that does not 

1215 exist and guessing is not allowed. 

1216 RuntimeError 

1217 Raised if a request is made for a single URI but multiple URIs 

1218 are associated with this dataset. 

1219 

1220 Notes 

1221 ----- 

1222 When a predicted URI is requested an attempt will be made to form 

1223 a reasonable URI based on file templates and the expected formatter. 

1224 """ 

1225 primary, components = self.getURIs(ref, predict) 

1226 if primary is None or components:

1227 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1228 "Use Dataastore.getURIs() instead.") 

1229 return primary 

1230 

1231 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1232 """Load an InMemoryDataset from the store. 

1233 

1234 Parameters 

1235 ---------- 

1236 ref : `DatasetRef` 

1237 Reference to the required Dataset. 

1238 parameters : `dict` 

1239 `StorageClass`-specific parameters that specify, for example, 

1240 a slice of the dataset to be loaded. 

1241 

1242 Returns 

1243 ------- 

1244 inMemoryDataset : `object` 

1245 Requested dataset or slice thereof as an InMemoryDataset. 

1246 

1247 Raises 

1248 ------ 

1249 FileNotFoundError 

1250 Requested dataset can not be retrieved. 

1251 TypeError 

1252 Return value from formatter has unexpected type. 

1253 ValueError 

1254 Formatter failed to process the dataset. 

1255 """ 

1256 allGetInfo = self._prepare_for_get(ref, parameters) 

1257 refComponent = ref.datasetType.component() 

1258 

1259 # Supplied storage class for the component being read 

1260 refStorageClass = ref.datasetType.storageClass 

1261 

1262 # Create mapping from component name to related info 

1263 allComponents = {i.component: i for i in allGetInfo} 

1264 

1265 # By definition the dataset is disassembled if we have more 

1266 # than one record for it. 

1267 isDisassembled = len(allGetInfo) > 1 

1268 

1269 # Look for the special case where we are disassembled but the 

1270 # component is a derived component that was not written during 

1271 # disassembly. For this scenario we need to check that the 

1272 # component requested is listed as a derived component for the 

1273 # composite storage class 

1274 isDisassembledReadOnlyComponent = False 

1275 if isDisassembled and refComponent: 

1276 # The composite storage class should be accessible through 

1277 # the component dataset type 

1278 compositeStorageClass = ref.datasetType.parentStorageClass 

1279 

1280 # In the unlikely scenario where the composite storage 

1281 # class is not known, we can only assume that this is a 

1282 # normal component. If that assumption is wrong then the 

1283 # branch below that reads a persisted component will fail 

1284 # so there is no need to complain here. 

1285 if compositeStorageClass is not None:

1286 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1287 

1288 if isDisassembled and not refComponent: 

1289 # This was a disassembled dataset spread over multiple files 

1290 # and we need to put them all back together again. 

1291 # Read into memory and then assemble 

1292 

1293 # Check that the supplied parameters are suitable for the type read 

1294 refStorageClass.validateParameters(parameters) 

1295 

1296 # We want to keep track of all the parameters that were not used 

1297 # by formatters. We assume that if any of the component formatters 

1298 # use a parameter that we do not need to apply it again in the 

1299 # assembler. 

1300 usedParams = set() 

1301 

1302 components: Dict[str, Any] = {} 

1303 for getInfo in allGetInfo: 

1304 # assemblerParams are parameters not understood by the 

1305 # associated formatter. 

1306 usedParams.update(set(getInfo.formatterParams)) 

1307 

1308 component = getInfo.component 

1309 

1310 if component is None:

1311 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1312 

1313 # We do not want the formatter to think it's reading 

1314 # a component though because it is really reading a 

1315 # standalone dataset -- always tell reader it is not a 

1316 # component. 

1317 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1318 

1319 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1320 

1321 # Any unused parameters will have to be passed to the assembler 

1322 if parameters: 

1323 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1324 else: 

1325 unusedParams = {} 

1326 

1327 # Process parameters 

1328 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1329 parameters=unusedParams) 

1330 

1331 elif isDisassembledReadOnlyComponent: 

1332 

1333 compositeStorageClass = ref.datasetType.parentStorageClass 

1334 if compositeStorageClass is None:

1335 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1336 "no composite storage class is available.") 

1337 

1338 if refComponent is None:

1339 # Mainly for mypy 

1340 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1341 

1342 # Assume that every derived component can be calculated by 

1343 # forwarding the request to a single read/write component. 

1344 # Rather than guessing which rw component is the right one by 

1345 # scanning each for a derived component of the same name, 

1346 # we ask the storage class delegate directly which one is best to 

1347 # use. 

1348 compositeDelegate = compositeStorageClass.delegate() 

1349 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1350 set(allComponents)) 

1351 

1352 # Select the relevant component 

1353 rwInfo = allComponents[forwardedComponent] 

1354 

1355 # For now assume that read parameters are validated against 

1356 # the real component and not the requested component 

1357 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1358 forwardedStorageClass.validateParameters(parameters) 

1359 

1360 # Unfortunately the FileDescriptor inside the formatter will have 

1361 # the wrong write storage class so we need to create a new one 

1362 # given the immutability constraint. 

1363 writeStorageClass = rwInfo.info.storageClass 

1364 

1365 # We may need to put some thought into parameters for read 

1366 # components but for now forward them on as is 

1367 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1368 readStorageClass=refStorageClass, 

1369 storageClass=writeStorageClass, 

1370 parameters=parameters), 

1371 ref.dataId) 

1372 

1373 # The assembler can not receive any parameter requests for a 

1374 # derived component at this time since the assembler will 

1375 # see the storage class of the derived component and those 

1376 # parameters will have to be handled by the formatter on the 

1377 # forwarded storage class. 

1378 assemblerParams: Dict[str, Any] = {} 

1379 

1380 # Need to create a new info that specifies the derived

1381 # component and associated storage class 

1382 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1383 rwInfo.info, assemblerParams, {}, 

1384 refComponent, refStorageClass) 

1385 

1386 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1387 

1388 else: 

1389 # Single file request or component from that composite file 

1390 for lookup in (refComponent, None):

1391 if lookup in allComponents:

1392 getInfo = allComponents[lookup] 

1393 break 

1394 else: 

1395 raise FileNotFoundError(f"Component {refComponent} not found " 

1396 f"for ref {ref} in datastore {self.name}") 

1397 

1398 # Do not need the component itself if already disassembled 

1399 if isDisassembled: 

1400 isComponent = False 

1401 else: 

1402 isComponent = getInfo.component is not None 

1403 

1404 # For a disassembled component we can validate parameters against 

1405 # the component storage class directly 

1406 if isDisassembled: 

1407 refStorageClass.validateParameters(parameters) 

1408 else: 

1409 # For an assembled composite this could be a derived 

1410 # component derived from a real component. The validity 

1411 # of the parameters is not clear. For now validate against 

1412 # the composite storage class 

1413 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1414 

1415 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1416 
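The branch above resolves a derived ("read-only") component by asking the composite storage class's delegate which stored read/write component should serve the request, then re-reading that component with a formatter whose read storage class matches the request. A minimal sketch of that selection step outside the datastore; the storage class name "ExposureF" and derived component "bbox" are illustrative assumptions, and the default storage class definitions are assumed to be loadable:

from lsst.daf.butler import StorageClassFactory

# Look up a composite storage class from the (assumed) default definitions.
composite = StorageClassFactory().getStorageClass("ExposureF")

# The read/write components that would actually be written to disk.
stored = set(composite.components)

# Ask the delegate which stored component can provide the derived
# component, mirroring selectResponsibleComponent() in the code above.
forwarded = composite.delegate().selectResponsibleComponent("bbox", stored)
print(f"Derived component 'bbox' is read via the '{forwarded}' component")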

1417 @transactional 

1418 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1419 """Write an InMemoryDataset with a given `DatasetRef` to the store. 

1420 

1421 Parameters 

1422 ---------- 

1423 inMemoryDataset : `object` 

1424 The dataset to store. 

1425 ref : `DatasetRef` 

1426 Reference to the associated Dataset. 

1427 

1428 Raises 

1429 ------ 

1430 TypeError 

1431 Supplied object and storage class are inconsistent. 

1432 DatasetTypeNotSupportedError 

1433 The associated `DatasetType` is not handled by this datastore. 

1434 

1435 Notes 

1436 ----- 

1437 If the datastore is configured to reject certain dataset types it 

1438 is possible that the put will fail and raise a 

1439 `DatasetTypeNotSupportedError`. The main use case for this is to 

1440 allow `ChainedDatastore` to put to multiple datastores without 

1441 requiring that every datastore accepts the dataset. 

1442 """ 

1443 

1444 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1445 # doDisassembly = True 

1446 

1447 artifacts = [] 

1448 if doDisassembly: 

1449 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1450 for component, componentInfo in components.items(): 

1451 # Don't recurse because we want to take advantage of 

1452 # bulk insert -- need a new DatasetRef that refers to the 

1453 # same dataset_id but has the component DatasetType. 

1454 # DatasetType does not refer to the types of components, 

1455 # so we construct one ourselves. 

1456 compRef = ref.makeComponentRef(component) 

1457 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1458 artifacts.append((compRef, storedInfo)) 

1459 else: 

1460 # Write the entire thing out 

1461 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1462 artifacts.append((ref, storedInfo)) 

1463 

1464 self._register_datasets(artifacts) 

1465 
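As a usage sketch of the behaviour documented above, where `datastore` is a configured `FileDatastore`, `my_object` the in-memory dataset, and `ref` a resolved `DatasetRef` (all three assumed to exist already):

from lsst.daf.butler import DatasetTypeNotSupportedError

try:
    # Writes one artifact, or one artifact per component if the
    # composites configuration says this ref should be disassembled.
    datastore.put(my_object, ref)
except DatasetTypeNotSupportedError:
    # This datastore's constraints reject the dataset type; a
    # ChainedDatastore would simply move on to its next member datastore.
    pass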

1466 @transactional 

1467 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1468 """Indicate to the datastore that a dataset can be removed. 

1469 

1470 Parameters 

1471 ---------- 

1472 ref : `DatasetRef` 

1473 Reference to the required Dataset. 

1474 ignore_errors : `bool` 

1475 If `True` return without error even if something went wrong. 

1476 Problems could occur if another process is simultaneously trying 

1477 to delete. 

1478 

1479 Raises 

1480 ------ 

1481 FileNotFoundError 

1482 Attempt to remove a dataset that does not exist. 

1483 """ 

1484 # Get file metadata and internal metadata 

1485 log.debug("Trashing %s in datastore %s", ref, self.name) 

1486 

1487 fileLocations = self._get_dataset_locations_info(ref) 

1488 

1489 if not fileLocations: 

1490 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1491 if ignore_errors: 

1492 log.warning(err_msg) 

1493 return 

1494 else: 

1495 raise FileNotFoundError(err_msg) 

1496 

1497 for location, storedFileInfo in fileLocations: 

1498 if not self._artifact_exists(location): 1498 ↛ 1499 line 1498 didn't jump to line 1499, because the condition on line 1498 was never true

1499 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1500 f"associated artifact ({location.uri}) is missing" 

1501 if ignore_errors: 

1502 log.warning(err_msg) 

1503 return 

1504 else: 

1505 raise FileNotFoundError(err_msg) 

1506 

1507 # Mark dataset as trashed 

1508 try: 

1509 self._move_to_trash_in_registry(ref) 

1510 except Exception as e: 

1511 if ignore_errors: 

1512 log.warning("Attempted to mark dataset (%s) to be trashed in datastore %s " 

1513 "but encountered an error: %s", ref, self.name, e) 

1514 pass 

1515 else: 

1516 raise 

1517 
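A short sketch of the two-step removal that `trash` begins, again assuming an existing `datastore` and `ref`; the artifact itself is only deleted later by `emptyTrash`:

# Mark the dataset as removable; with ignore_errors=False an unknown
# dataset raises FileNotFoundError instead of being logged and skipped.
datastore.trash(ref, ignore_errors=False)

# The file remains on disk until the trash is emptied.
datastore.emptyTrash()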

1518 @transactional 

1519 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1520 """Remove all datasets from the trash. 

1521 

1522 Parameters 

1523 ---------- 

1524 ignore_errors : `bool` 

1525 If `True` return without error even if something went wrong. 

1526 Problems could occur if another process is simultaneously trying 

1527 to delete. 

1528 """ 

1529 log.debug("Emptying trash in datastore %s", self.name) 

1530 # Context manager will empty trash iff we finish it without raising. 

1531 with self.bridge.emptyTrash() as trashed: 

1532 for ref in trashed: 

1533 fileLocations = self._get_dataset_locations_info(ref) 

1534 

1535 if not fileLocations: 1535 ↛ 1536 line 1535 didn't jump to line 1536, because the condition on line 1535 was never true

1536 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1537 if ignore_errors: 

1538 log.warning(err_msg) 

1539 continue 

1540 else: 

1541 raise FileNotFoundError(err_msg) 

1542 

1543 for location, _ in fileLocations: 

1544 

1545 if not self._artifact_exists(location): 1545 ↛ 1546 line 1545 didn't jump to line 1546, because the condition on line 1545 was never true

1546 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1547 if ignore_errors: 

1548 log.warning(err_msg) 

1549 continue 

1550 else: 

1551 raise FileNotFoundError(err_msg) 

1552 

1553 # Can only delete the artifact if there are no references 

1554 # to the file from untrashed dataset refs. 

1555 if self._can_remove_dataset_artifact(ref, location): 

1556 # Point of no return for this artifact 

1557 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1558 try: 

1559 self._delete_artifact(location) 

1560 except Exception as e: 

1561 if ignore_errors: 

1562 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1563 location.uri, self.name, e) 

1564 else: 

1565 raise 

1566 

1567 # We must now remove the entry from the internal registry even if 

1568 # the artifact removal failed and was ignored; otherwise the 

1569 # removal check above will never be true. 

1570 try: 

1571 # There may be multiple rows associated with this ref 

1572 # depending on disassembly 

1573 self.removeStoredItemInfo(ref) 

1574 except Exception as e: 

1575 if ignore_errors: 

1576 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1577 ref.id, location.uri, self.name, e) 

1578 continue 

1579 else: 

1580 raise FileNotFoundError( 

1581 f"Error removing dataset {ref.id} ({location.uri}) from internal registry " 

1582 f"of {self.name}" 

1583 ) from e 

1584 
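The `with self.bridge.emptyTrash() as trashed:` block above relies on a commit-on-success context manager: registry rows are only dropped if the body finishes without raising. A standard-library-only sketch of that pattern, with all names hypothetical:

from contextlib import contextmanager

@contextmanager
def empty_trash(trashed, registry_rows):
    # Hand the trashed items to the caller first.
    yield trashed
    # Only reached if the caller's block did not raise: commit the removal.
    registry_rows.difference_update(trashed)

rows = {"a", "b", "c"}
with empty_trash({"a", "b"}, rows) as items:
    for item in items:
        pass  # delete artifacts here; an exception would leave `rows` intact
print(rows)  # {'c'}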

1585 @transactional 

1586 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1587 # Docstring inherited. 

1588 refs = list(refs) 

1589 self.bridge.forget(refs) 

1590 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1591 
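By contrast with `trash`/`emptyTrash`, `forget` only drops the datastore's records and leaves artifacts on disk; a brief sketch assuming `datastore` and a list `refs` already exist:

# Remove all internal knowledge of these datasets without deleting files.
datastore.forget(refs)

# The datasets are now unknown to this datastore even though their
# artifacts still exist at the original locations.
assert not datastore.exists(refs[0])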

1592 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1593 logFailures: bool = False) -> None: 

1594 """Validate some of the configuration for this datastore. 

1595 

1596 Parameters 

1597 ---------- 

1598 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1599 Entities to test against this configuration. Can be differing 

1600 types. 

1601 logFailures : `bool`, optional 

1602 If `True`, output a log message for every validation error 

1603 detected. 

1604 

1605 Raises 

1606 ------ 

1607 DatastoreValidationError 

1608 Raised if there is a validation problem with a configuration. 

1609 All the problems are reported in a single exception. 

1610 

1611 Notes 

1612 ----- 

1613 This method checks that all the supplied entities have valid file 

1614 templates and also have formatters defined. 

1615 """ 

1616 

1617 templateFailed = None 

1618 try: 

1619 self.templates.validateTemplates(entities, logFailures=logFailures) 

1620 except FileTemplateValidationError as e: 

1621 templateFailed = str(e) 

1622 

1623 formatterFailed = [] 

1624 for entity in entities: 

1625 try: 

1626 self.formatterFactory.getFormatterClass(entity) 

1627 except KeyError as e: 

1628 formatterFailed.append(str(e)) 

1629 if logFailures: 1629 ↛ 1624 line 1629 didn't jump to line 1624, because the condition on line 1629 was never false

1630 log.critical("Formatter failure: %s", e) 

1631 

1632 if templateFailed or formatterFailed: 

1633 messages = [] 

1634 if templateFailed: 1634 ↛ 1635 line 1634 didn't jump to line 1635, because the condition on line 1634 was never true

1635 messages.append(templateFailed) 

1636 if formatterFailed: 1636 ↛ 1638 line 1636 didn't jump to line 1638, because the condition on line 1636 was never false

1637 messages.append(",".join(formatterFailed)) 

1638 msg = ";\n".join(messages) 

1639 raise DatastoreValidationError(msg) 

1640 
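A sketch of driving this validation for every dataset type known to a registry; the repository path is a placeholder and `Butler` access to `registry` and `datastore` is assumed:

from lsst.daf.butler import Butler, DatastoreValidationError

butler = Butler("/path/to/repo")  # placeholder repository

dataset_types = list(butler.registry.queryDatasetTypes())
try:
    butler.datastore.validateConfiguration(dataset_types, logFailures=True)
except DatastoreValidationError as err:
    # Every template and formatter problem is reported here at once.
    print(f"Datastore configuration problems:\n{err}")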

1641 def getLookupKeys(self) -> Set[LookupKey]: 

1642 # Docstring is inherited from base class 

1643 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1644 self.constraints.getLookupKeys() 

1645 

1646 def validateKey(self, lookupKey: LookupKey, 

1647 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1648 # Docstring is inherited from base class 

1649 # The key can be valid in either formatters or templates so we can 

1650 # only check the template if it exists 

1651 if lookupKey in self.templates: 

1652 try: 

1653 self.templates[lookupKey].validateTemplate(entity) 

1654 except FileTemplateValidationError as e: 

1655 raise DatastoreValidationError(e) from e 

1656 

1657 def export(self, refs: Iterable[DatasetRef], *, 

1658 directory: Optional[Union[ButlerURI, str]] = None, 

1659 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1660 # Docstring inherited from Datastore.export. 

1661 if transfer is not None and directory is None: 1661 ↛ 1662 line 1661 didn't jump to line 1662, because the condition on line 1661 was never true

1662 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1663 "export directory given") 

1664 

1665 # Force the directory to be a URI object 

1666 directoryUri: Optional[ButlerURI] = None 

1667 if directory is not None: 1667 ↛ 1670 line 1667 didn't jump to line 1670, because the condition on line 1667 was never false

1668 directoryUri = ButlerURI(directory, forceDirectory=True) 

1669 

1670 if transfer is not None and directoryUri is not None: 1670 ↛ 1675 line 1670 didn't jump to line 1675, because the condition on line 1670 was never false

1671 # mypy needs the second test 

1672 if not directoryUri.exists(): 1672 ↛ 1673 line 1672 didn't jump to line 1673, because the condition on line 1672 was never true

1673 raise FileNotFoundError(f"Export location {directory} does not exist") 

1674 

1675 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

1676 for ref in progress.wrap(refs, "Exporting dataset files"): 

1677 fileLocations = self._get_dataset_locations_info(ref) 

1678 if not fileLocations: 1678 ↛ 1679 line 1678 didn't jump to line 1679, because the condition on line 1678 was never true

1679 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1680 # For now we can not export disassembled datasets 

1681 if len(fileLocations) > 1: 

1682 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1683 location, storedFileInfo = fileLocations[0] 

1684 

1685 pathInStore = location.pathInStore.path 

1686 if transfer is None: 1686 ↛ 1689 line 1686 didn't jump to line 1689, because the condition on line 1686 was never true

1687 # TODO: do we also need to return the readStorageClass somehow? 

1688 # We will use the path in store directly 

1689 pass 

1690 elif transfer == "direct": 1690 ↛ 1692 line 1690 didn't jump to line 1692, because the condition on line 1690 was never true

1691 # Use full URIs to the remote store in the export 

1692 pathInStore = str(location.uri) 

1693 else: 

1694 # mypy needs help 

1695 assert directoryUri is not None, "directoryUri must be defined to get here" 

1696 storeUri = ButlerURI(location.uri) 

1697 

1698 # if the datastore has an absolute URI to a resource, we 

1699 # have two options: 

1700 # 1. Keep the absolute URI in the exported YAML 

1701 # 2. Allocate a new name in the local datastore and transfer 

1702 # it. 

1703 # For now go with option 2 

1704 if location.pathInStore.isabs(): 1704 ↛ 1705 line 1704 didn't jump to line 1705, because the condition on line 1704 was never true

1705 template = self.templates.getTemplate(ref) 

1706 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

1707 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

1708 

1709 exportUri = directoryUri.join(pathInStore) 

1710 exportUri.transfer_from(storeUri, transfer=transfer) 

1711 

1712 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1713 
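A hedged sketch of calling this export directly; the destination directory and transfer mode are placeholders, and `datastore` and `refs` are assumed to exist:

from lsst.daf.butler import ButlerURI

destination = ButlerURI("/tmp/butler_export/", forceDirectory=True)

# Copy each single-file artifact into the export directory and collect
# the FileDataset records that describe them (e.g. for an export YAML).
exported = list(datastore.export(refs, directory=destination, transfer="copy"))
for file_dataset in exported:
    print(file_dataset.path, file_dataset.formatter)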

1714 @staticmethod 

1715 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1716 """Compute the checksum of the supplied file. 

1717 

1718 Parameters 

1719 ---------- 

1720 uri : `ButlerURI` 

1721 Name of resource to calculate checksum from. 

1722 algorithm : `str`, optional 

1723 Name of algorithm to use. Must be one of the algorithms supported 

1724 by :py:mod:`hashlib`. 

1725 block_size : `int` 

1726 Number of bytes to read from file at one time. 

1727 

1728 Returns 

1729 ------- 

1730 hexdigest : `str` or `None` 

1731 Hex digest of the file. 

1732 

1733 Notes 

1734 ----- 

1735 Currently returns None if the URI is for a remote resource. 

1736 """ 

1737 if algorithm not in hashlib.algorithms_guaranteed: 1737 ↛ 1738 line 1737 didn't jump to line 1738, because the condition on line 1737 was never true

1738 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1739 

1740 if not uri.isLocal: 1740 ↛ 1741 line 1740 didn't jump to line 1741, because the condition on line 1740 was never true

1741 return None 

1742 

1743 hasher = hashlib.new(algorithm) 

1744 

1745 with uri.as_local() as local_uri: 

1746 with open(local_uri.ospath, "rb") as f: 

1747 for chunk in iter(lambda: f.read(block_size), b""): 

1748 hasher.update(chunk) 

1749 

1750 return hasher.hexdigest()
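Since `computeChecksum` is a static method it can be exercised on any local file; a minimal sketch using a throwaway temporary file (the contents and algorithm choices are arbitrary):

import tempfile

from lsst.daf.butler import ButlerURI

# Write a small local file purely for demonstration purposes.
with tempfile.NamedTemporaryFile(suffix=".dat", delete=False) as fh:
    fh.write(b"some bytes to checksum")
    local_path = fh.name

uri = ButlerURI(local_path)
print(FileDatastore.computeChecksum(uri))  # default blake2b digest
print(FileDatastore.computeChecksum(uri, algorithm="sha256", block_size=4096))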