Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileDatastore", ) 

26 

27import hashlib 

28import logging 

29import os 

30import tempfile 

31 

32from sqlalchemy import BigInteger, String 

33 

34from dataclasses import dataclass 

35from typing import ( 

36 TYPE_CHECKING, 

37 Any, 

38 ClassVar, 

39 Dict, 

40 Iterable, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.daf.butler import ( 

51 ButlerURI, 

52 CompositesMap, 

53 Config, 

54 FileDataset, 

55 DatasetRef, 

56 DatasetType, 

57 DatasetTypeNotSupportedError, 

58 Datastore, 

59 DatastoreCacheManager, 

60 DatastoreDisabledCacheManager, 

61 DatastoreConfig, 

62 DatastoreValidationError, 

63 FileDescriptor, 

64 FileTemplates, 

65 FileTemplateValidationError, 

66 Formatter, 

67 FormatterFactory, 

68 Location, 

69 LocationFactory, 

70 Progress, 

71 StorageClass, 

72 StoredFileInfo, 

73) 

74 

75from lsst.daf.butler import ddl 

76from lsst.daf.butler.registry.interfaces import ( 

77 ReadOnlyDatabaseError, 

78 DatastoreRegistryBridge, 

79) 

80 

81from lsst.daf.butler.core.repoRelocation import replaceRoot 

82from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

83from .genericDatastore import GenericBaseDatastore 

84 

85if TYPE_CHECKING: 85 ↛ 86line 85 didn't jump to line 86, because the condition on line 85 was never true

86 from lsst.daf.butler import LookupKey, AbstractDatastoreCacheManager 

87 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

88 

89log = logging.getLogger(__name__) 

90 

91# String to use when a Python None is encountered 

92NULLSTR = "__NULL_STRING__" 

93 

94 

95class _IngestPrepData(Datastore.IngestPrepData): 

96 """Helper class for FileDatastore ingest implementation. 

97 

98 Parameters 

99 ---------- 

100 datasets : `list` of `FileDataset` 

101 Files to be ingested by this datastore. 

102 """ 

103 def __init__(self, datasets: List[FileDataset]): 

104 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

105 self.datasets = datasets 

106 

107 

108@dataclass(frozen=True) 

109class DatastoreFileGetInformation: 

110 """Collection of useful parameters needed to retrieve a file from 

111 a Datastore. 

112 """ 

113 

114 location: Location 

115 """The location from which to read the dataset.""" 

116 

117 formatter: Formatter 

118 """The `Formatter` to use to deserialize the dataset.""" 

119 

120 info: StoredFileInfo 

121 """Stored information about this file and its formatter.""" 

122 

123 assemblerParams: Dict[str, Any] 

124 """Parameters to use for post-processing the retrieved dataset.""" 

125 

126 formatterParams: Dict[str, Any] 

127 """Parameters that were understood by the associated formatter.""" 

128 

129 component: Optional[str] 

130 """The component to be retrieved (can be `None`).""" 

131 

132 readStorageClass: StorageClass 

133 """The `StorageClass` of the dataset being read.""" 

134 

135 

136class FileDatastore(GenericBaseDatastore): 

137 """Generic Datastore for file-based implementations. 

138 

139 Should always be sub-classed since key abstract methods are missing. 

140 

141 Parameters 

142 ---------- 

143 config : `DatastoreConfig` or `str` 

144 Configuration as either a `Config` object or URI to file. 

145 bridgeManager : `DatastoreRegistryBridgeManager` 

146 Object that manages the interface between `Registry` and datastores. 

147 butlerRoot : `str`, optional 

148 New datastore root to use to override the configuration value. 

149 

150 Raises 

151 ------ 

152 ValueError 

153 If root location does not exist and ``create`` is `False` in the 

154 configuration. 

155 """ 

156 

157 defaultConfigFile: ClassVar[Optional[str]] = None 

158 """Path to configuration defaults. Accessed within the ``config`` resource 

159 or relative to a search path. Can be None if no defaults specified. 

160 """ 

161 

162 root: ButlerURI 

163 """Root directory URI of this `Datastore`.""" 

164 

165 locationFactory: LocationFactory 

166 """Factory for creating locations relative to the datastore root.""" 

167 

168 formatterFactory: FormatterFactory 

169 """Factory for creating instances of formatters.""" 

170 

171 templates: FileTemplates 

172 """File templates that can be used by this `Datastore`.""" 

173 

174 composites: CompositesMap 

175 """Determines whether a dataset should be disassembled on put.""" 

176 

177 defaultConfigFile = "datastores/fileDatastore.yaml" 

178 """Path to configuration defaults. Accessed within the ``config`` resource 

179 or relative to a search path. Can be None if no defaults specified. 

180 """ 

181 

182 @classmethod 

183 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

184 """Set any filesystem-dependent config options for this Datastore to 

185 be appropriate for a new empty repository with the given root. 

186 

187 Parameters 

188 ---------- 

189 root : `str` 

190 URI to the root of the data repository. 

191 config : `Config` 

192 A `Config` to update. Only the subset understood by 

193 this component will be updated. Will not expand 

194 defaults. 

195 full : `Config` 

196 A complete config with all defaults expanded that can be 

197 converted to a `DatastoreConfig`. Read-only and will not be 

198 modified by this method. 

199 Repository-specific options that should not be obtained 

200 from defaults when Butler instances are constructed 

201 should be copied from ``full`` to ``config``. 

202 overwrite : `bool`, optional 

203 If `False`, do not modify a value in ``config`` if the value 

204 already exists. Default is always to overwrite with the provided 

205 ``root``. 

206 

207 Notes 

208 ----- 

209 If a keyword is explicitly defined in the supplied ``config`` it 

210 will not be overridden by this method if ``overwrite`` is `False`. 

211 This allows explicit values set in external configs to be retained. 

212 """ 

213 Config.updateParameters(DatastoreConfig, config, full, 

214 toUpdate={"root": root}, 

215 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

216 

217 @classmethod 

218 def makeTableSpec(cls) -> ddl.TableSpec: 

219 return ddl.TableSpec( 

220 fields=[ 

221 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

222 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

223 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

224 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

225 # Use empty string to indicate no component 

226 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

227 # TODO: should checksum be Base64Bytes instead? 

228 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

229 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

230 ], 

231 unique=frozenset(), 

232 ) 

233 

234 def __init__(self, config: Union[DatastoreConfig, str], 

235 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

236 super().__init__(config, bridgeManager) 

237 if "root" not in self.config: 237 ↛ 238line 237 didn't jump to line 238, because the condition on line 237 was never true

238 raise ValueError("No root directory specified in configuration") 

239 

240 # Name ourselves either using an explicit name or a name 

241 # derived from the (unexpanded) root 

242 if "name" in self.config: 

243 self.name = self.config["name"] 

244 else: 

245 # We use the unexpanded root in the name to indicate that this 

246 # datastore can be moved without having to update registry. 

247 self.name = "{}@{}".format(type(self).__name__, 

248 self.config["root"]) 

249 

250 # Support repository relocation in config 

251 # Existence of self.root is checked in subclass 

252 self.root = ButlerURI(replaceRoot(self.config["root"], butlerRoot), 

253 forceDirectory=True, forceAbsolute=True) 

254 

255 self.locationFactory = LocationFactory(self.root) 

256 self.formatterFactory = FormatterFactory() 

257 

258 # Now associate formatters with storage classes 

259 self.formatterFactory.registerFormatters(self.config["formatters"], 

260 universe=bridgeManager.universe) 

261 

262 # Read the file naming templates 

263 self.templates = FileTemplates(self.config["templates"], 

264 universe=bridgeManager.universe) 

265 

266 # See if composites should be disassembled 

267 self.composites = CompositesMap(self.config["composites"], 

268 universe=bridgeManager.universe) 

269 

270 tableName = self.config["records", "table"] 

271 try: 

272 # Storage of paths and formatters, keyed by dataset_id 

273 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

274 # Interface to Registry. 

275 self._bridge = bridgeManager.register(self.name) 

276 except ReadOnlyDatabaseError: 

277 # If the database is read only and we just tried and failed to 

278 # create a table, it means someone is trying to create a read-only 

279 # butler client for an empty repo. That should be okay, as long 

280 # as they then try to get any datasets before some other client 

281 # creates the table. Chances are they'rejust validating 

282 # configuration. 

283 pass 

284 

285 # Determine whether checksums should be used - default to False 

286 self.useChecksum = self.config.get("checksum", False) 

287 

288 # Determine whether we can fall back to configuration if a 

289 # requested dataset is not known to registry 

290 self.trustGetRequest = self.config.get("trust_get_request", False) 

291 

292 # Create a cache manager 

293 self.cacheManager: AbstractDatastoreCacheManager 

294 if "cached" in self.config: 294 ↛ 298line 294 didn't jump to line 298, because the condition on line 294 was never false

295 self.cacheManager = DatastoreCacheManager(self.config["cached"], 

296 universe=bridgeManager.universe) 

297 else: 

298 self.cacheManager = DatastoreDisabledCacheManager("", 

299 universe=bridgeManager.universe) 

300 

301 # Check existence and create directory structure if necessary 

302 if not self.root.exists(): 

303 if "create" not in self.config or not self.config["create"]: 303 ↛ 304line 303 didn't jump to line 304, because the condition on line 303 was never true

304 raise ValueError(f"No valid root and not allowed to create one at: {self.root}") 

305 try: 

306 self.root.mkdir() 

307 except Exception as e: 

308 raise ValueError(f"Can not create datastore root '{self.root}', check permissions." 

309 f" Got error: {e}") from e 

310 

311 def __str__(self) -> str: 

312 return str(self.root) 

313 

314 @property 

315 def bridge(self) -> DatastoreRegistryBridge: 

316 return self._bridge 

317 

318 def _artifact_exists(self, location: Location) -> bool: 

319 """Check that an artifact exists in this datastore at the specified 

320 location. 

321 

322 Parameters 

323 ---------- 

324 location : `Location` 

325 Expected location of the artifact associated with this datastore. 

326 

327 Returns 

328 ------- 

329 exists : `bool` 

330 True if the location can be found, false otherwise. 

331 """ 

332 log.debug("Checking if resource exists: %s", location.uri) 

333 return location.uri.exists() 

334 

335 def _delete_artifact(self, location: Location) -> None: 

336 """Delete the artifact from the datastore. 

337 

338 Parameters 

339 ---------- 

340 location : `Location` 

341 Location of the artifact associated with this datastore. 

342 """ 

343 if location.pathInStore.isabs(): 343 ↛ 344line 343 didn't jump to line 344, because the condition on line 343 was never true

344 raise RuntimeError(f"Cannot delete artifact with absolute uri {location.uri}.") 

345 log.debug("Deleting file: %s", location.uri) 

346 location.uri.remove() 

347 log.debug("Successfully deleted file: %s", location.uri) 

348 

349 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

350 # Docstring inherited from GenericBaseDatastore 

351 records = [] 

352 for ref, info in zip(refs, infos): 

353 # Component should come from ref and fall back on info 

354 component = ref.datasetType.component() 

355 if component is None and info.component is not None: 355 ↛ 356line 355 didn't jump to line 356, because the condition on line 355 was never true

356 component = info.component 

357 if component is None: 

358 # Use empty string since we want this to be part of the 

359 # primary key. 

360 component = NULLSTR 

361 records.append( 

362 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

363 storage_class=info.storageClass.name, component=component, 

364 checksum=info.checksum, file_size=info.file_size) 

365 ) 

366 self._table.insert(*records) 

367 

368 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

369 # Docstring inherited from GenericBaseDatastore 

370 

371 # Look for the dataset_id -- there might be multiple matches 

372 # if we have disassembled the dataset. 

373 records = list(self._table.fetch(dataset_id=ref.id)) 

374 

375 results = [] 

376 for record in records: 

377 # Convert name of StorageClass to instance 

378 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

379 component = record["component"] if (record["component"] 

380 and record["component"] != NULLSTR) else None 

381 

382 info = StoredFileInfo(formatter=record["formatter"], 

383 path=record["path"], 

384 storageClass=storageClass, 

385 component=component, 

386 checksum=record["checksum"], 

387 file_size=record["file_size"]) 

388 results.append(info) 

389 

390 return results 

391 

392 def _registered_refs_per_artifact(self, pathInStore: ButlerURI) -> Set[int]: 

393 """Return all dataset refs associated with the supplied path. 

394 

395 Parameters 

396 ---------- 

397 pathInStore : `ButlerURI` 

398 Path of interest in the data store. 

399 

400 Returns 

401 ------- 

402 ids : `set` of `int` 

403 All `DatasetRef` IDs associated with this path. 

404 """ 

405 records = list(self._table.fetch(path=str(pathInStore))) 

406 ids = {r["dataset_id"] for r in records} 

407 return ids 

408 

409 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

410 # Docstring inherited from GenericBaseDatastore 

411 self._table.delete(["dataset_id"], {"dataset_id": ref.id}) 

412 

413 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

414 r"""Find all the `Location`\ s of the requested dataset in the 

415 `Datastore` and the associated stored file information. 

416 

417 Parameters 

418 ---------- 

419 ref : `DatasetRef` 

420 Reference to the required `Dataset`. 

421 

422 Returns 

423 ------- 

424 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

425 Location of the dataset within the datastore and 

426 stored information about each file and its formatter. 

427 """ 

428 # Get the file information (this will fail if no file) 

429 records = self.getStoredItemsInfo(ref) 

430 

431 # Use the path to determine the location -- we need to take 

432 # into account absolute URIs in the datastore record 

433 locations: List[Tuple[Location, StoredFileInfo]] = [] 

434 for r in records: 

435 uriInStore = ButlerURI(r.path, forceAbsolute=False) 

436 if uriInStore.isabs(): 436 ↛ 437line 436 didn't jump to line 437, because the condition on line 436 was never true

437 location = Location(None, uriInStore) 

438 else: 

439 location = self.locationFactory.fromPath(r.path) 

440 locations.append((location, r)) 

441 return locations 

442 

443 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

444 """Check that there is only one dataset associated with the 

445 specified artifact. 

446 

447 Parameters 

448 ---------- 

449 ref : `DatasetRef` or `FakeDatasetRef` 

450 Dataset to be removed. 

451 location : `Location` 

452 The location of the artifact to be removed. 

453 

454 Returns 

455 ------- 

456 can_remove : `Bool` 

457 True if the artifact can be safely removed. 

458 """ 

459 # Can't ever delete absolute URIs. 

460 if location.pathInStore.isabs(): 460 ↛ 461line 460 didn't jump to line 461, because the condition on line 460 was never true

461 return False 

462 

463 # Get all entries associated with this path 

464 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

465 if not allRefs: 465 ↛ 466line 465 didn't jump to line 466, because the condition on line 465 was never true

466 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

467 

468 # Remove these refs from all the refs and if there is nothing left 

469 # then we can delete 

470 remainingRefs = allRefs - {ref.id} 

471 

472 if remainingRefs: 

473 return False 

474 return True 

475 

476 def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> List[Tuple[Location, 

477 StoredFileInfo]]: 

478 """Predict the location and related file information of the requested 

479 dataset in this datastore. 

480 

481 Parameters 

482 ---------- 

483 ref : `DatasetRef` 

484 Reference to the required `Dataset`. 

485 

486 Returns 

487 ------- 

488 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

489 Expected Location of the dataset within the datastore and 

490 placeholder information about each file and its formatter. 

491 

492 Notes 

493 ----- 

494 Uses the current configuration to determine how we would expect the 

495 datastore files to have been written if we couldn't ask registry. 

496 This is safe so long as there has been no change to datastore 

497 configuration between writing the dataset and wanting to read it. 

498 Will not work for files that have been ingested without using the 

499 standard file template or default formatter. 

500 """ 

501 

502 # If we have a component ref we always need to ask the questions 

503 # of the composite. If the composite is disassembled this routine 

504 # should return all components. If the composite was not 

505 # disassembled the composite is what is stored regardless of 

506 # component request. Note that if the caller has disassembled 

507 # a composite there is no way for this guess to know that 

508 # without trying both the composite and component ref and seeing 

509 # if there is something at the component Location even without 

510 # disassembly being enabled. 

511 if ref.datasetType.isComponent(): 

512 ref = ref.makeCompositeRef() 

513 

514 # See if the ref is a composite that should be disassembled 

515 doDisassembly = self.composites.shouldBeDisassembled(ref) 

516 

517 all_info: List[Tuple[Location, Formatter, StorageClass, Optional[str]]] = [] 

518 

519 if doDisassembly: 

520 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

521 compRef = ref.makeComponentRef(component) 

522 location, formatter = self._determine_put_formatter_location(compRef) 

523 all_info.append((location, formatter, componentStorage, component)) 

524 

525 else: 

526 # Always use the composite ref if no disassembly 

527 location, formatter = self._determine_put_formatter_location(ref) 

528 all_info.append((location, formatter, ref.datasetType.storageClass, None)) 

529 

530 # Convert the list of tuples to have StoredFileInfo as second element 

531 return [(location, StoredFileInfo(formatter=formatter, 

532 path=location.pathInStore.path, 

533 storageClass=storageClass, 

534 component=component, 

535 checksum=None, 

536 file_size=-1)) 

537 for location, formatter, storageClass, component in all_info] 

538 

539 def _prepare_for_get(self, ref: DatasetRef, 

540 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

541 """Check parameters for ``get`` and obtain formatter and 

542 location. 

543 

544 Parameters 

545 ---------- 

546 ref : `DatasetRef` 

547 Reference to the required Dataset. 

548 parameters : `dict` 

549 `StorageClass`-specific parameters that specify, for example, 

550 a slice of the dataset to be loaded. 

551 

552 Returns 

553 ------- 

554 getInfo : `list` [`DatastoreFileGetInformation`] 

555 Parameters needed to retrieve each file. 

556 """ 

557 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

558 

559 # Get file metadata and internal metadata 

560 fileLocations = self._get_dataset_locations_info(ref) 

561 if not fileLocations: 

562 if not self.trustGetRequest: 

563 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

564 # Assume the dataset is where we think it should be 

565 fileLocations = self._get_expected_dataset_locations_info(ref) 

566 

567 # The storage class we want to use eventually 

568 refStorageClass = ref.datasetType.storageClass 

569 

570 if len(fileLocations) > 1: 

571 disassembled = True 

572 else: 

573 disassembled = False 

574 

575 # Is this a component request? 

576 refComponent = ref.datasetType.component() 

577 

578 fileGetInfo = [] 

579 for location, storedFileInfo in fileLocations: 

580 

581 # The storage class used to write the file 

582 writeStorageClass = storedFileInfo.storageClass 

583 

584 # If this has been disassembled we need read to match the write 

585 if disassembled: 

586 readStorageClass = writeStorageClass 

587 else: 

588 readStorageClass = refStorageClass 

589 

590 formatter = getInstanceOf(storedFileInfo.formatter, 

591 FileDescriptor(location, readStorageClass=readStorageClass, 

592 storageClass=writeStorageClass, parameters=parameters), 

593 ref.dataId) 

594 

595 formatterParams, notFormatterParams = formatter.segregateParameters() 

596 

597 # Of the remaining parameters, extract the ones supported by 

598 # this StorageClass (for components not all will be handled) 

599 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

600 

601 # The ref itself could be a component if the dataset was 

602 # disassembled by butler, or we disassembled in datastore and 

603 # components came from the datastore records 

604 component = storedFileInfo.component if storedFileInfo.component else refComponent 

605 

606 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

607 assemblerParams, formatterParams, 

608 component, readStorageClass)) 

609 

610 return fileGetInfo 

611 

612 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

613 """Check the arguments for ``put`` and obtain formatter and 

614 location. 

615 

616 Parameters 

617 ---------- 

618 inMemoryDataset : `object` 

619 The dataset to store. 

620 ref : `DatasetRef` 

621 Reference to the associated Dataset. 

622 

623 Returns 

624 ------- 

625 location : `Location` 

626 The location to write the dataset. 

627 formatter : `Formatter` 

628 The `Formatter` to use to write the dataset. 

629 

630 Raises 

631 ------ 

632 TypeError 

633 Supplied object and storage class are inconsistent. 

634 DatasetTypeNotSupportedError 

635 The associated `DatasetType` is not handled by this datastore. 

636 """ 

637 self._validate_put_parameters(inMemoryDataset, ref) 

638 return self._determine_put_formatter_location(ref) 

639 

640 def _determine_put_formatter_location(self, ref: DatasetRef) -> Tuple[Location, Formatter]: 

641 """Calculate the formatter and output location to use for put. 

642 

643 Parameters 

644 ---------- 

645 ref : `DatasetRef` 

646 Reference to the associated Dataset. 

647 

648 Returns 

649 ------- 

650 location : `Location` 

651 The location to write the dataset. 

652 formatter : `Formatter` 

653 The `Formatter` to use to write the dataset. 

654 """ 

655 # Work out output file name 

656 try: 

657 template = self.templates.getTemplate(ref) 

658 except KeyError as e: 

659 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

660 

661 # Validate the template to protect against filenames from different 

662 # dataIds returning the same and causing overwrite confusion. 

663 template.validateTemplate(ref) 

664 

665 location = self.locationFactory.fromPath(template.format(ref)) 

666 

667 # Get the formatter based on the storage class 

668 storageClass = ref.datasetType.storageClass 

669 try: 

670 formatter = self.formatterFactory.getFormatter(ref, 

671 FileDescriptor(location, 

672 storageClass=storageClass), 

673 ref.dataId) 

674 except KeyError as e: 

675 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref} in datastore " 

676 f"{self.name}") from e 

677 

678 # Now that we know the formatter, update the location 

679 location = formatter.makeUpdatedLocation(location) 

680 

681 return location, formatter 

682 

683 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

684 # Docstring inherited from base class 

685 if transfer != "auto": 

686 return transfer 

687 

688 # See if the paths are within the datastore or not 

689 inside = [self._pathInStore(d.path) is not None for d in datasets] 

690 

691 if all(inside): 

692 transfer = None 

693 elif not any(inside): 693 ↛ 697line 693 didn't jump to line 697, because the condition on line 693 was never false

694 # Allow ButlerURI to use its own knowledge 

695 transfer = "auto" 

696 else: 

697 raise ValueError("Some datasets are inside the datastore and some are outside." 

698 " Please use an explicit transfer mode and not 'auto'.") 

699 

700 return transfer 

701 

702 def _pathInStore(self, path: Union[str, ButlerURI]) -> Optional[str]: 

703 """Return path relative to datastore root 

704 

705 Parameters 

706 ---------- 

707 path : `str` or `ButlerURI` 

708 Path to dataset. Can be absolute URI. If relative assumed to 

709 be relative to the datastore. Returns path in datastore 

710 or raises an exception if the path it outside. 

711 

712 Returns 

713 ------- 

714 inStore : `str` 

715 Path relative to datastore root. Returns `None` if the file is 

716 outside the root. 

717 """ 

718 # Relative path will always be relative to datastore 

719 pathUri = ButlerURI(path, forceAbsolute=False) 

720 return pathUri.relative_to(self.root) 

721 

722 def _standardizeIngestPath(self, path: Union[str, ButlerURI], *, 

723 transfer: Optional[str] = None) -> Union[str, ButlerURI]: 

724 """Standardize the path of a to-be-ingested file. 

725 

726 Parameters 

727 ---------- 

728 path : `str` or `ButlerURI` 

729 Path of a file to be ingested. 

730 transfer : `str`, optional 

731 How (and whether) the dataset should be added to the datastore. 

732 See `ingest` for details of transfer modes. 

733 This implementation is provided only so 

734 `NotImplementedError` can be raised if the mode is not supported; 

735 actual transfers are deferred to `_extractIngestInfo`. 

736 

737 Returns 

738 ------- 

739 path : `str` or `ButlerURI` 

740 New path in what the datastore considers standard form. If an 

741 absolute URI was given that will be returned unchanged. 

742 

743 Notes 

744 ----- 

745 Subclasses of `FileDatastore` can implement this method instead 

746 of `_prepIngest`. It should not modify the data repository or given 

747 file in any way. 

748 

749 Raises 

750 ------ 

751 NotImplementedError 

752 Raised if the datastore does not support the given transfer mode 

753 (including the case where ingest is not supported at all). 

754 FileNotFoundError 

755 Raised if one of the given files does not exist. 

756 """ 

757 if transfer not in (None, "direct") + self.root.transferModes: 757 ↛ 758line 757 didn't jump to line 758, because the condition on line 757 was never true

758 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

759 

760 # A relative URI indicates relative to datastore root 

761 srcUri = ButlerURI(path, forceAbsolute=False) 

762 if not srcUri.isabs(): 

763 srcUri = self.root.join(path) 

764 

765 if not srcUri.exists(): 

766 raise FileNotFoundError(f"Resource at {srcUri} does not exist; note that paths to ingest " 

767 f"are assumed to be relative to {self.root} unless they are absolute.") 

768 

769 if transfer is None: 

770 relpath = srcUri.relative_to(self.root) 

771 if not relpath: 

772 raise RuntimeError(f"Transfer is none but source file ({srcUri}) is not " 

773 f"within datastore ({self.root})") 

774 

775 # Return the relative path within the datastore for internal 

776 # transfer 

777 path = relpath 

778 

779 return path 

780 

781 def _extractIngestInfo(self, path: Union[str, ButlerURI], ref: DatasetRef, *, 

782 formatter: Union[Formatter, Type[Formatter]], 

783 transfer: Optional[str] = None) -> StoredFileInfo: 

784 """Relocate (if necessary) and extract `StoredFileInfo` from a 

785 to-be-ingested file. 

786 

787 Parameters 

788 ---------- 

789 path : `str` or `ButlerURI` 

790 URI or path of a file to be ingested. 

791 ref : `DatasetRef` 

792 Reference for the dataset being ingested. Guaranteed to have 

793 ``dataset_id not None`. 

794 formatter : `type` or `Formatter` 

795 `Formatter` subclass to use for this dataset or an instance. 

796 transfer : `str`, optional 

797 How (and whether) the dataset should be added to the datastore. 

798 See `ingest` for details of transfer modes. 

799 

800 Returns 

801 ------- 

802 info : `StoredFileInfo` 

803 Internal datastore record for this file. This will be inserted by 

804 the caller; the `_extractIngestInfo` is only resposible for 

805 creating and populating the struct. 

806 

807 Raises 

808 ------ 

809 FileNotFoundError 

810 Raised if one of the given files does not exist. 

811 FileExistsError 

812 Raised if transfer is not `None` but the (internal) location the 

813 file would be moved to is already occupied. 

814 """ 

815 if self._transaction is None: 815 ↛ 816line 815 didn't jump to line 816, because the condition on line 815 was never true

816 raise RuntimeError("Ingest called without transaction enabled") 

817 

818 # Create URI of the source path, do not need to force a relative 

819 # path to absolute. 

820 srcUri = ButlerURI(path, forceAbsolute=False) 

821 

822 # Track whether we have read the size of the source yet 

823 have_sized = False 

824 

825 tgtLocation: Optional[Location] 

826 if transfer is None: 

827 # A relative path is assumed to be relative to the datastore 

828 # in this context 

829 if not srcUri.isabs(): 

830 tgtLocation = self.locationFactory.fromPath(srcUri.ospath) 

831 else: 

832 # Work out the path in the datastore from an absolute URI 

833 # This is required to be within the datastore. 

834 pathInStore = srcUri.relative_to(self.root) 

835 if pathInStore is None: 835 ↛ 836line 835 didn't jump to line 836, because the condition on line 835 was never true

836 raise RuntimeError(f"Unexpectedly learned that {srcUri} is " 

837 f"not within datastore {self.root}") 

838 tgtLocation = self.locationFactory.fromPath(pathInStore) 

839 elif transfer == "direct": 839 ↛ 844line 839 didn't jump to line 844, because the condition on line 839 was never true

840 # Want to store the full URI to the resource directly in 

841 # datastore. This is useful for referring to permanent archive 

842 # storage for raw data. 

843 # Trust that people know what they are doing. 

844 tgtLocation = None 

845 else: 

846 # Work out the name we want this ingested file to have 

847 # inside the datastore 

848 tgtLocation = self._calculate_ingested_datastore_name(srcUri, ref, formatter) 

849 if not tgtLocation.uri.dirname().exists(): 

850 log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) 

851 tgtLocation.uri.dirname().mkdir() 

852 

853 # if we are transferring from a local file to a remote location 

854 # it may be more efficient to get the size and checksum of the 

855 # local file rather than the transferred one 

856 if not srcUri.scheme or srcUri.scheme == "file": 856 ↛ 862line 856 didn't jump to line 862, because the condition on line 856 was never false

857 size = srcUri.size() 

858 checksum = self.computeChecksum(srcUri) if self.useChecksum else None 

859 have_sized = True 

860 

861 # transfer the resource to the destination 

862 tgtLocation.uri.transfer_from(srcUri, transfer=transfer, transaction=self._transaction) 

863 

864 if tgtLocation is None: 864 ↛ 866line 864 didn't jump to line 866, because the condition on line 864 was never true

865 # This means we are using direct mode 

866 targetUri = srcUri 

867 targetPath = str(srcUri) 

868 else: 

869 targetUri = tgtLocation.uri 

870 targetPath = tgtLocation.pathInStore.path 

871 

872 # the file should exist in the datastore now 

873 if not have_sized: 

874 size = targetUri.size() 

875 checksum = self.computeChecksum(targetUri) if self.useChecksum else None 

876 

877 return StoredFileInfo(formatter=formatter, path=targetPath, 

878 storageClass=ref.datasetType.storageClass, 

879 component=ref.datasetType.component(), 

880 file_size=size, checksum=checksum) 

881 

882 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

883 # Docstring inherited from Datastore._prepIngest. 

884 filtered = [] 

885 for dataset in datasets: 

886 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

887 if not acceptable: 

888 continue 

889 else: 

890 dataset.refs = acceptable 

891 if dataset.formatter is None: 

892 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

893 else: 

894 assert isinstance(dataset.formatter, (type, str)) 

895 dataset.formatter = getClassOf(dataset.formatter) 

896 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

897 filtered.append(dataset) 

898 return _IngestPrepData(filtered) 

899 

900 @transactional 

901 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

902 # Docstring inherited from Datastore._finishIngest. 

903 refsAndInfos = [] 

904 progress = Progress("lsst.daf.butler.datastores.FileDatastore.ingest", level=logging.DEBUG) 

905 for dataset in progress.wrap(prepData.datasets, desc="Ingesting dataset files"): 

906 # Do ingest as if the first dataset ref is associated with the file 

907 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

908 transfer=transfer) 

909 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

910 self._register_datasets(refsAndInfos) 

911 

912 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

913 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

914 """Given a source URI and a DatasetRef, determine the name the 

915 dataset will have inside datastore. 

916 

917 Parameters 

918 ---------- 

919 srcUri : `ButlerURI` 

920 URI to the source dataset file. 

921 ref : `DatasetRef` 

922 Ref associated with the newly-ingested dataset artifact. This 

923 is used to determine the name within the datastore. 

924 formatter : `Formatter` or Formatter class. 

925 Formatter to use for validation. Can be a class or an instance. 

926 

927 Returns 

928 ------- 

929 location : `Location` 

930 Target location for the newly-ingested dataset. 

931 """ 

932 # Ingesting a file from outside the datastore. 

933 # This involves a new name. 

934 template = self.templates.getTemplate(ref) 

935 location = self.locationFactory.fromPath(template.format(ref)) 

936 

937 # Get the extension 

938 ext = srcUri.getExtension() 

939 

940 # Update the destination to include that extension 

941 location.updateExtension(ext) 

942 

943 # Ask the formatter to validate this extension 

944 formatter.validateExtension(location) 

945 

946 return location 

947 

948 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

949 """Write out in memory dataset to datastore. 

950 

951 Parameters 

952 ---------- 

953 inMemoryDataset : `object` 

954 Dataset to write to datastore. 

955 ref : `DatasetRef` 

956 Registry information associated with this dataset. 

957 

958 Returns 

959 ------- 

960 info : `StoredFileInfo` 

961 Information describin the artifact written to the datastore. 

962 """ 

963 location, formatter = self._prepare_for_put(inMemoryDataset, ref) 

964 uri = location.uri 

965 

966 if not uri.dirname().exists(): 

967 log.debug("Folder %s does not exist yet so creating it.", uri.dirname()) 

968 uri.dirname().mkdir() 

969 

970 if self._transaction is None: 970 ↛ 971line 970 didn't jump to line 971, because the condition on line 970 was never true

971 raise RuntimeError("Attempting to write artifact without transaction enabled") 

972 

973 def _removeFileExists(uri: ButlerURI) -> None: 

974 """Remove a file and do not complain if it is not there. 

975 

976 This is important since a formatter might fail before the file 

977 is written and we should not confuse people by writing spurious 

978 error messages to the log. 

979 """ 

980 try: 

981 uri.remove() 

982 except FileNotFoundError: 

983 pass 

984 

985 # Register a callback to try to delete the uploaded data if 

986 # something fails below 

987 self._transaction.registerUndo("artifactWrite", _removeFileExists, uri) 

988 

989 # For a local file, simply use the formatter directly 

990 if uri.isLocal: 

991 formatter.write(inMemoryDataset) 

992 log.debug("Successfully wrote python object to local file at %s", uri) 

993 else: 

994 # This is a remote URI, so first try bytes and write directly else 

995 # fallback to a temporary file 

996 try: 

997 serializedDataset = formatter.toBytes(inMemoryDataset) 

998 log.debug("Writing bytes directly to %s", uri) 

999 uri.write(serializedDataset, overwrite=True) 

1000 log.debug("Successfully wrote bytes directly to %s", uri) 

1001 except NotImplementedError: 

1002 with tempfile.NamedTemporaryFile(suffix=uri.getExtension()) as tmpFile: 

1003 # Need to configure the formatter to write to a different 

1004 # location and that needs us to overwrite internals 

1005 tmpLocation = Location(*os.path.split(tmpFile.name)) 

1006 log.debug("Writing dataset to temporary location at %s", tmpLocation.uri) 

1007 with formatter._updateLocation(tmpLocation): 

1008 formatter.write(inMemoryDataset) 

1009 uri.transfer_from(tmpLocation.uri, transfer="copy", overwrite=True) 

1010 

1011 # Cache if required 

1012 self.cacheManager.move_to_cache(tmpLocation.uri, ref) 

1013 

1014 log.debug("Successfully wrote dataset to %s via a temporary file.", uri) 

1015 

1016 # URI is needed to resolve what ingest case are we dealing with 

1017 return self._extractIngestInfo(uri, ref, formatter=formatter) 

1018 

1019 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

1020 ref: DatasetRef, isComponent: bool = False) -> Any: 

1021 """Read the artifact from datastore into in memory object. 

1022 

1023 Parameters 

1024 ---------- 

1025 getInfo : `DatastoreFileGetInformation` 

1026 Information about the artifact within the datastore. 

1027 ref : `DatasetRef` 

1028 The registry information associated with this artifact. 

1029 isComponent : `bool` 

1030 Flag to indicate if a component is being read from this artifact. 

1031 

1032 Returns 

1033 ------- 

1034 inMemoryDataset : `object` 

1035 The artifact as a python object. 

1036 """ 

1037 location = getInfo.location 

1038 uri = location.uri 

1039 log.debug("Accessing data from %s", uri) 

1040 

1041 # Cannot recalculate checksum but can compare size as a quick check 

1042 # Do not do this if the size is negative since that indicates 

1043 # we do not know. 

1044 recorded_size = getInfo.info.file_size 

1045 resource_size = uri.size() 

1046 if recorded_size >= 0 and resource_size != recorded_size: 1046 ↛ 1047line 1046 didn't jump to line 1047, because the condition on line 1046 was never true

1047 raise RuntimeError("Integrity failure in Datastore. " 

1048 f"Size of file {uri} ({resource_size}) " 

1049 f"does not match size recorded in registry of {recorded_size}") 

1050 

1051 # For the general case we have choices for how to proceed. 

1052 # 1. Always use a local file (downloading the remote resource to a 

1053 # temporary file if needed). 

1054 # 2. Use a threshold size and read into memory and use bytes. 

1055 # Use both for now with an arbitrary hand off size. 

1056 # This allows small datasets to be downloaded from remote object 

1057 # stores without requiring a temporary file. 

1058 

1059 formatter = getInfo.formatter 

1060 nbytes_max = 10_000_000 # Arbitrary number that we can tune 

1061 if resource_size <= nbytes_max and formatter.can_read_bytes(): 

1062 serializedDataset = uri.read() 

1063 log.debug("Deserializing %s from %d bytes from location %s with formatter %s", 

1064 f"component {getInfo.component}" if isComponent else "", 

1065 len(serializedDataset), uri, formatter.name()) 

1066 try: 

1067 result = formatter.fromBytes(serializedDataset, 

1068 component=getInfo.component if isComponent else None) 

1069 except Exception as e: 

1070 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1071 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1072 else: 

1073 # Read from file. 

1074 

1075 # Have to update the Location associated with the formatter 

1076 # because formatter.read does not allow an override. 

1077 # This could be improved. 

1078 location_updated = False 

1079 msg = "" 

1080 

1081 # First check in cache for local version. 

1082 # The cache will only be relevant for remote resources. 

1083 if not uri.isLocal: 

1084 cached_file = self.cacheManager.find_in_cache(ref, uri.getExtension()) 

1085 if cached_file is not None: 1085 ↛ 1086line 1085 didn't jump to line 1086, because the condition on line 1085 was never true

1086 msg = f"(via cache read of remote file {uri})" 

1087 uri = cached_file 

1088 location_updated = True 

1089 

1090 with uri.as_local() as local_uri: 

1091 

1092 # URI was remote and file was downloaded 

1093 if uri != local_uri: 

1094 cache_msg = "" 

1095 location_updated = True 

1096 

1097 # Cache the downloaded file if needed. 

1098 cached_uri = self.cacheManager.move_to_cache(local_uri, ref) 

1099 if cached_uri is not None: 1099 ↛ 1100line 1099 didn't jump to line 1100, because the condition on line 1099 was never true

1100 local_uri = cached_uri 

1101 cache_msg = " and cached" 

1102 

1103 msg = f"(via download to local file{cache_msg})" 

1104 

1105 # Calculate the (possibly) new location for the formatter 

1106 # to use. 

1107 newLocation = Location(*local_uri.split()) if location_updated else None 

1108 

1109 log.debug("Reading%s from location %s %s with formatter %s", 

1110 f" component {getInfo.component}" if isComponent else "", 

1111 uri, msg, formatter.name()) 

1112 try: 

1113 with formatter._updateLocation(newLocation): 

1114 result = formatter.read(component=getInfo.component if isComponent else None) 

1115 except Exception as e: 

1116 raise ValueError(f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" 

1117 f" ({ref.datasetType.name} from {uri}): {e}") from e 

1118 

1119 return self._post_process_get(result, getInfo.readStorageClass, getInfo.assemblerParams, 

1120 isComponent=isComponent) 

1121 

1122 def exists(self, ref: DatasetRef) -> bool: 

1123 """Check if the dataset exists in the datastore. 

1124 

1125 Parameters 

1126 ---------- 

1127 ref : `DatasetRef` 

1128 Reference to the required dataset. 

1129 

1130 Returns 

1131 ------- 

1132 exists : `bool` 

1133 `True` if the entity exists in the `Datastore`. 

1134 """ 

1135 fileLocations = self._get_dataset_locations_info(ref) 

1136 

1137 # if we are being asked to trust that registry might not be correct 

1138 # we ask for the expected locations and check them explicitly 

1139 if not fileLocations: 

1140 if not self.trustGetRequest: 

1141 return False 

1142 fileLocations = self._get_expected_dataset_locations_info(ref) 

1143 for location, _ in fileLocations: 

1144 if not self._artifact_exists(location): 

1145 return False 

1146 

1147 return True 

1148 

1149 def getURIs(self, ref: DatasetRef, 

1150 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1151 """Return URIs associated with dataset. 

1152 

1153 Parameters 

1154 ---------- 

1155 ref : `DatasetRef` 

1156 Reference to the required dataset. 

1157 predict : `bool`, optional 

1158 If the datastore does not know about the dataset, should it 

1159 return a predicted URI or not? 

1160 

1161 Returns 

1162 ------- 

1163 primary : `ButlerURI` 

1164 The URI to the primary artifact associated with this dataset. 

1165 If the dataset was disassembled within the datastore this 

1166 may be `None`. 

1167 components : `dict` 

1168 URIs to any components associated with the dataset artifact. 

1169 Can be empty if there are no components. 

1170 """ 

1171 

1172 primary: Optional[ButlerURI] = None 

1173 components: Dict[str, ButlerURI] = {} 

1174 

1175 # if this has never been written then we have to guess 

1176 if not self.exists(ref): 

1177 if not predict: 

1178 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

1179 

1180 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1181 

1182 if doDisassembly: 

1183 

1184 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

1185 compRef = ref.makeComponentRef(component) 

1186 compLocation, _ = self._determine_put_formatter_location(compRef) 

1187 

1188 # Add a URI fragment to indicate this is a guess 

1189 components[component] = ButlerURI(compLocation.uri.geturl() + "#predicted") 

1190 

1191 else: 

1192 

1193 location, _ = self._determine_put_formatter_location(ref) 

1194 

1195 # Add a URI fragment to indicate this is a guess 

1196 primary = ButlerURI(location.uri.geturl() + "#predicted") 

1197 

1198 return primary, components 

1199 

1200 # If this is a ref that we have written we can get the path. 

1201 # Get file metadata and internal metadata 

1202 fileLocations = self._get_dataset_locations_info(ref) 

1203 

1204 guessing = False 

1205 if not fileLocations: 

1206 if not self.trustGetRequest: 1206 ↛ 1207line 1206 didn't jump to line 1207, because the condition on line 1206 was never true

1207 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

1208 fileLocations = self._get_expected_dataset_locations_info(ref) 

1209 guessing = True 

1210 

1211 if len(fileLocations) == 1: 

1212 # No disassembly so this is the primary URI 

1213 uri = fileLocations[0][0].uri 

1214 if guessing and not uri.exists(): 1214 ↛ 1215line 1214 didn't jump to line 1215, because the condition on line 1214 was never true

1215 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1216 primary = uri 

1217 

1218 else: 

1219 for location, storedFileInfo in fileLocations: 

1220 if storedFileInfo.component is None: 1220 ↛ 1221line 1220 didn't jump to line 1221, because the condition on line 1220 was never true

1221 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

1222 uri = location.uri 

1223 if guessing and not uri.exists(): 1223 ↛ 1224line 1223 didn't jump to line 1224, because the condition on line 1223 was never true

1224 raise FileNotFoundError(f"Expected URI ({uri}) does not exist") 

1225 components[storedFileInfo.component] = uri 

1226 

1227 return primary, components 

1228 

1229 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

1230 """URI to the Dataset. 

1231 

1232 Parameters 

1233 ---------- 

1234 ref : `DatasetRef` 

1235 Reference to the required Dataset. 

1236 predict : `bool` 

1237 If `True`, allow URIs to be returned of datasets that have not 

1238 been written. 

1239 

1240 Returns 

1241 ------- 

1242 uri : `str` 

1243 URI pointing to the dataset within the datastore. If the 

1244 dataset does not exist in the datastore, and if ``predict`` is 

1245 `True`, the URI will be a prediction and will include a URI 

1246 fragment "#predicted". 

1247 If the datastore does not have entities that relate well 

1248 to the concept of a URI the returned URI will be 

1249 descriptive. The returned URI is not guaranteed to be obtainable. 

1250 

1251 Raises 

1252 ------ 

1253 FileNotFoundError 

1254 Raised if a URI has been requested for a dataset that does not 

1255 exist and guessing is not allowed. 

1256 RuntimeError 

1257 Raised if a request is made for a single URI but multiple URIs 

1258 are associated with this dataset. 

1259 

1260 Notes 

1261 ----- 

1262 When a predicted URI is requested an attempt will be made to form 

1263 a reasonable URI based on file templates and the expected formatter. 

1264 """ 

1265 primary, components = self.getURIs(ref, predict) 

1266 if primary is None or components: 1266 ↛ 1267line 1266 didn't jump to line 1267, because the condition on line 1266 was never true

1267 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

1268 "Use Dataastore.getURIs() instead.") 

1269 return primary 

1270 

1271 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

1272 """Load an InMemoryDataset from the store. 

1273 

1274 Parameters 

1275 ---------- 

1276 ref : `DatasetRef` 

1277 Reference to the required Dataset. 

1278 parameters : `dict` 

1279 `StorageClass`-specific parameters that specify, for example, 

1280 a slice of the dataset to be loaded. 

1281 

1282 Returns 

1283 ------- 

1284 inMemoryDataset : `object` 

1285 Requested dataset or slice thereof as an InMemoryDataset. 

1286 

1287 Raises 

1288 ------ 

1289 FileNotFoundError 

1290 Requested dataset can not be retrieved. 

1291 TypeError 

1292 Return value from formatter has unexpected type. 

1293 ValueError 

1294 Formatter failed to process the dataset. 

1295 """ 

1296 allGetInfo = self._prepare_for_get(ref, parameters) 

1297 refComponent = ref.datasetType.component() 

1298 

1299 # Supplied storage class for the component being read 

1300 refStorageClass = ref.datasetType.storageClass 

1301 

1302 # Create mapping from component name to related info 

1303 allComponents = {i.component: i for i in allGetInfo} 

1304 

1305 # By definition the dataset is disassembled if we have more 

1306 # than one record for it. 

1307 isDisassembled = len(allGetInfo) > 1 

1308 

1309 # Look for the special case where we are disassembled but the 

1310 # component is a derived component that was not written during 

1311 # disassembly. For this scenario we need to check that the 

1312 # component requested is listed as a derived component for the 

1313 # composite storage class 

1314 isDisassembledReadOnlyComponent = False 

1315 if isDisassembled and refComponent: 

1316 # The composite storage class should be accessible through 

1317 # the component dataset type 

1318 compositeStorageClass = ref.datasetType.parentStorageClass 

1319 

1320 # In the unlikely scenario where the composite storage 

1321 # class is not known, we can only assume that this is a 

1322 # normal component. If that assumption is wrong then the 

1323 # branch below that reads a persisted component will fail 

1324 # so there is no need to complain here. 

1325 if compositeStorageClass is not None: 1325 ↛ 1328line 1325 didn't jump to line 1328, because the condition on line 1325 was never false

1326 isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents 

1327 

1328 if isDisassembled and not refComponent: 

1329 # This was a disassembled dataset spread over multiple files 

1330 # and we need to put them all back together again. 

1331 # Read into memory and then assemble 

1332 

1333 # Check that the supplied parameters are suitable for the type read 

1334 refStorageClass.validateParameters(parameters) 

1335 

1336 # We want to keep track of all the parameters that were not used 

1337 # by formatters. We assume that if any of the component formatters 

1338 # use a parameter that we do not need to apply it again in the 

1339 # assembler. 

1340 usedParams = set() 

1341 

1342 components: Dict[str, Any] = {} 

1343 for getInfo in allGetInfo: 

1344 # assemblerParams are parameters not understood by the 

1345 # associated formatter. 

1346 usedParams.update(set(getInfo.formatterParams)) 

1347 

1348 component = getInfo.component 

1349 

1350 if component is None: 1350 ↛ 1351line 1350 didn't jump to line 1351, because the condition on line 1350 was never true

1351 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1352 

1353 # We do not want the formatter to think it's reading 

1354 # a component though because it is really reading a 

1355 # standalone dataset -- always tell reader it is not a 

1356 # component. 

1357 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1358 

1359 inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) 

1360 

1361 # Any unused parameters will have to be passed to the assembler 

1362 if parameters: 

1363 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1364 else: 

1365 unusedParams = {} 

1366 

1367 # Process parameters 

1368 return ref.datasetType.storageClass.delegate().handleParameters(inMemoryDataset, 

1369 parameters=unusedParams) 

1370 

1371 elif isDisassembledReadOnlyComponent: 

1372 

1373 compositeStorageClass = ref.datasetType.parentStorageClass 

1374 if compositeStorageClass is None: 1374 ↛ 1375line 1374 didn't jump to line 1375, because the condition on line 1374 was never true

1375 raise RuntimeError(f"Unable to retrieve derived component '{refComponent}' since" 

1376 "no composite storage class is available.") 

1377 

1378 if refComponent is None: 1378 ↛ 1380line 1378 didn't jump to line 1380, because the condition on line 1378 was never true

1379 # Mainly for mypy 

1380 raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") 

1381 

1382 # Assume that every derived component can be calculated by 

1383 # forwarding the request to a single read/write component. 

1384 # Rather than guessing which rw component is the right one by 

1385 # scanning each for a derived component of the same name, 

1386 # we ask the storage class delegate directly which one is best to 

1387 # use. 

1388 compositeDelegate = compositeStorageClass.delegate() 

1389 forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, 

1390 set(allComponents)) 

1391 

1392 # Select the relevant component 

1393 rwInfo = allComponents[forwardedComponent] 

1394 

1395 # For now assume that read parameters are validated against 

1396 # the real component and not the requested component 

1397 forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass 

1398 forwardedStorageClass.validateParameters(parameters) 

1399 

1400 # Unfortunately the FileDescriptor inside the formatter will have 

1401 # the wrong write storage class so we need to create a new one 

1402 # given the immutability constraint. 

1403 writeStorageClass = rwInfo.info.storageClass 

1404 

1405 # We may need to put some thought into parameters for read 

1406 # components but for now forward them on as is 

1407 readFormatter = type(rwInfo.formatter)(FileDescriptor(rwInfo.location, 

1408 readStorageClass=refStorageClass, 

1409 storageClass=writeStorageClass, 

1410 parameters=parameters), 

1411 ref.dataId) 

1412 

1413 # The assembler can not receive any parameter requests for a 

1414 # derived component at this time since the assembler will 

1415 # see the storage class of the derived component and those 

1416 # parameters will have to be handled by the formatter on the 

1417 # forwarded storage class. 

1418 assemblerParams: Dict[str, Any] = {} 

1419 

1420 # Need to created a new info that specifies the derived 

1421 # component and associated storage class 

1422 readInfo = DatastoreFileGetInformation(rwInfo.location, readFormatter, 

1423 rwInfo.info, assemblerParams, {}, 

1424 refComponent, refStorageClass) 

1425 

1426 return self._read_artifact_into_memory(readInfo, ref, isComponent=True) 

1427 

1428 else: 

1429 # Single file request or component from that composite file 

1430 for lookup in (refComponent, None): 1430 ↛ 1435line 1430 didn't jump to line 1435, because the loop on line 1430 didn't complete

1431 if lookup in allComponents: 1431 ↛ 1430line 1431 didn't jump to line 1430, because the condition on line 1431 was never false

1432 getInfo = allComponents[lookup] 

1433 break 

1434 else: 

1435 raise FileNotFoundError(f"Component {refComponent} not found " 

1436 f"for ref {ref} in datastore {self.name}") 

1437 

1438 # Do not need the component itself if already disassembled 

1439 if isDisassembled: 

1440 isComponent = False 

1441 else: 

1442 isComponent = getInfo.component is not None 

1443 

1444 # For a disassembled component we can validate parametersagainst 

1445 # the component storage class directly 

1446 if isDisassembled: 

1447 refStorageClass.validateParameters(parameters) 

1448 else: 

1449 # For an assembled composite this could be a derived 

1450 # component derived from a real component. The validity 

1451 # of the parameters is not clear. For now validate against 

1452 # the composite storage class 

1453 getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) 

1454 

1455 return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent) 

1456 

1457 @transactional 

1458 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1459 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1460 

1461 Parameters 

1462 ---------- 

1463 inMemoryDataset : `object` 

1464 The dataset to store. 

1465 ref : `DatasetRef` 

1466 Reference to the associated Dataset. 

1467 

1468 Raises 

1469 ------ 

1470 TypeError 

1471 Supplied object and storage class are inconsistent. 

1472 DatasetTypeNotSupportedError 

1473 The associated `DatasetType` is not handled by this datastore. 

1474 

1475 Notes 

1476 ----- 

1477 If the datastore is configured to reject certain dataset types it 

1478 is possible that the put will fail and raise a 

1479 `DatasetTypeNotSupportedError`. The main use case for this is to 

1480 allow `ChainedDatastore` to put to multiple datastores without 

1481 requiring that every datastore accepts the dataset. 

1482 """ 

1483 

1484 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1485 # doDisassembly = True 

1486 

1487 artifacts = [] 

1488 if doDisassembly: 

1489 components = ref.datasetType.storageClass.delegate().disassemble(inMemoryDataset) 

1490 for component, componentInfo in components.items(): 

1491 # Don't recurse because we want to take advantage of 

1492 # bulk insert -- need a new DatasetRef that refers to the 

1493 # same dataset_id but has the component DatasetType 

1494 # DatasetType does not refer to the types of components 

1495 # So we construct one ourselves. 

1496 compRef = ref.makeComponentRef(component) 

1497 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1498 artifacts.append((compRef, storedInfo)) 

1499 else: 

1500 # Write the entire thing out 

1501 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1502 artifacts.append((ref, storedInfo)) 

1503 

1504 self._register_datasets(artifacts) 

1505 

1506 @transactional 

1507 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1508 """Indicate to the datastore that a dataset can be removed. 

1509 

1510 Parameters 

1511 ---------- 

1512 ref : `DatasetRef` 

1513 Reference to the required Dataset. 

1514 ignore_errors : `bool` 

1515 If `True` return without error even if something went wrong. 

1516 Problems could occur if another process is simultaneously trying 

1517 to delete. 

1518 

1519 Raises 

1520 ------ 

1521 FileNotFoundError 

1522 Attempt to remove a dataset that does not exist. 

1523 """ 

1524 # Get file metadata and internal metadata 

1525 log.debug("Trashing %s in datastore %s", ref, self.name) 

1526 

1527 fileLocations = self._get_dataset_locations_info(ref) 

1528 

1529 if not fileLocations: 

1530 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1531 if ignore_errors: 

1532 log.warning(err_msg) 

1533 return 

1534 else: 

1535 raise FileNotFoundError(err_msg) 

1536 

1537 for location, storedFileInfo in fileLocations: 

1538 if not self._artifact_exists(location): 1538 ↛ 1539line 1538 didn't jump to line 1539, because the condition on line 1538 was never true

1539 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1540 f"associated artifact ({location.uri}) is missing" 

1541 if ignore_errors: 

1542 log.warning(err_msg) 

1543 return 

1544 else: 

1545 raise FileNotFoundError(err_msg) 

1546 

1547 # Mark dataset as trashed 

1548 try: 

1549 self._move_to_trash_in_registry(ref) 

1550 except Exception as e: 

1551 if ignore_errors: 

1552 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1553 f"but encountered an error: {e}") 

1554 pass 

1555 else: 

1556 raise 

1557 

1558 @transactional 

1559 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1560 """Remove all datasets from the trash. 

1561 

1562 Parameters 

1563 ---------- 

1564 ignore_errors : `bool` 

1565 If `True` return without error even if something went wrong. 

1566 Problems could occur if another process is simultaneously trying 

1567 to delete. 

1568 """ 

1569 log.debug("Emptying trash in datastore %s", self.name) 

1570 # Context manager will empty trash iff we finish it without raising. 

1571 with self.bridge.emptyTrash() as trashed: 

1572 for ref in trashed: 

1573 fileLocations = self._get_dataset_locations_info(ref) 

1574 

1575 if not fileLocations: 1575 ↛ 1576line 1575 didn't jump to line 1576, because the condition on line 1575 was never true

1576 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1577 if ignore_errors: 

1578 log.warning(err_msg) 

1579 continue 

1580 else: 

1581 raise FileNotFoundError(err_msg) 

1582 

1583 for location, _ in fileLocations: 

1584 

1585 if not self._artifact_exists(location): 1585 ↛ 1586line 1585 didn't jump to line 1586, because the condition on line 1585 was never true

1586 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1587 if ignore_errors: 

1588 log.warning(err_msg) 

1589 continue 

1590 else: 

1591 raise FileNotFoundError(err_msg) 

1592 

1593 # Can only delete the artifact if there are no references 

1594 # to the file from untrashed dataset refs. 

1595 if self._can_remove_dataset_artifact(ref, location): 

1596 # Point of no return for this artifact 

1597 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1598 try: 

1599 self._delete_artifact(location) 

1600 except Exception as e: 

1601 if ignore_errors: 

1602 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1603 location.uri, self.name, e) 

1604 else: 

1605 raise 

1606 

1607 # Now must remove the entry from the internal registry even if 

1608 # the artifact removal failed and was ignored, 

1609 # otherwise the removal check above will never be true 

1610 try: 

1611 # There may be multiple rows associated with this ref 

1612 # depending on disassembly 

1613 self.removeStoredItemInfo(ref) 

1614 except Exception as e: 

1615 if ignore_errors: 

1616 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1617 ref.id, location.uri, self.name, e) 

1618 continue 

1619 else: 

1620 raise FileNotFoundError( 

1621 f"Error removing dataset {ref.id} ({location.uri}) from internal registry " 

1622 f"of {self.name}" 

1623 ) from e 

1624 

1625 @transactional 

1626 def forget(self, refs: Iterable[DatasetRef]) -> None: 

1627 # Docstring inherited. 

1628 refs = list(refs) 

1629 self.bridge.forget(refs) 

1630 self._table.delete(["dataset_id"], *[{"dataset_id": ref.getCheckedId()} for ref in refs]) 

1631 

1632 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1633 logFailures: bool = False) -> None: 

1634 """Validate some of the configuration for this datastore. 

1635 

1636 Parameters 

1637 ---------- 

1638 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1639 Entities to test against this configuration. Can be differing 

1640 types. 

1641 logFailures : `bool`, optional 

1642 If `True`, output a log message for every validation error 

1643 detected. 

1644 

1645 Raises 

1646 ------ 

1647 DatastoreValidationError 

1648 Raised if there is a validation problem with a configuration. 

1649 All the problems are reported in a single exception. 

1650 

1651 Notes 

1652 ----- 

1653 This method checks that all the supplied entities have valid file 

1654 templates and also have formatters defined. 

1655 """ 

1656 

1657 templateFailed = None 

1658 try: 

1659 self.templates.validateTemplates(entities, logFailures=logFailures) 

1660 except FileTemplateValidationError as e: 

1661 templateFailed = str(e) 

1662 

1663 formatterFailed = [] 

1664 for entity in entities: 

1665 try: 

1666 self.formatterFactory.getFormatterClass(entity) 

1667 except KeyError as e: 

1668 formatterFailed.append(str(e)) 

1669 if logFailures: 1669 ↛ 1664line 1669 didn't jump to line 1664, because the condition on line 1669 was never false

1670 log.critical("Formatter failure: %s", e) 

1671 

1672 if templateFailed or formatterFailed: 

1673 messages = [] 

1674 if templateFailed: 1674 ↛ 1675line 1674 didn't jump to line 1675, because the condition on line 1674 was never true

1675 messages.append(templateFailed) 

1676 if formatterFailed: 1676 ↛ 1678line 1676 didn't jump to line 1678, because the condition on line 1676 was never false

1677 messages.append(",".join(formatterFailed)) 

1678 msg = ";\n".join(messages) 

1679 raise DatastoreValidationError(msg) 

1680 

1681 def getLookupKeys(self) -> Set[LookupKey]: 

1682 # Docstring is inherited from base class 

1683 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1684 self.constraints.getLookupKeys() 

1685 

1686 def validateKey(self, lookupKey: LookupKey, 

1687 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1688 # Docstring is inherited from base class 

1689 # The key can be valid in either formatters or templates so we can 

1690 # only check the template if it exists 

1691 if lookupKey in self.templates: 

1692 try: 

1693 self.templates[lookupKey].validateTemplate(entity) 

1694 except FileTemplateValidationError as e: 

1695 raise DatastoreValidationError(e) from e 

1696 

1697 def export(self, refs: Iterable[DatasetRef], *, 

1698 directory: Optional[Union[ButlerURI, str]] = None, 

1699 transfer: Optional[str] = "auto") -> Iterable[FileDataset]: 

1700 # Docstring inherited from Datastore.export. 

1701 if transfer is not None and directory is None: 1701 ↛ 1702line 1701 didn't jump to line 1702, because the condition on line 1701 was never true

1702 raise RuntimeError(f"Cannot export using transfer mode {transfer} with no " 

1703 "export directory given") 

1704 

1705 # Force the directory to be a URI object 

1706 directoryUri: Optional[ButlerURI] = None 

1707 if directory is not None: 1707 ↛ 1710line 1707 didn't jump to line 1710, because the condition on line 1707 was never false

1708 directoryUri = ButlerURI(directory, forceDirectory=True) 

1709 

1710 if transfer is not None and directoryUri is not None: 1710 ↛ 1715line 1710 didn't jump to line 1715, because the condition on line 1710 was never false

1711 # mypy needs the second test 

1712 if not directoryUri.exists(): 1712 ↛ 1713line 1712 didn't jump to line 1713, because the condition on line 1712 was never true

1713 raise FileNotFoundError(f"Export location {directory} does not exist") 

1714 

1715 progress = Progress("lsst.daf.butler.datastores.FileDatastore.export", level=logging.DEBUG) 

1716 for ref in progress.wrap(refs, "Exporting dataset files"): 

1717 fileLocations = self._get_dataset_locations_info(ref) 

1718 if not fileLocations: 1718 ↛ 1719line 1718 didn't jump to line 1719, because the condition on line 1718 was never true

1719 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

1720 # For now we can not export disassembled datasets 

1721 if len(fileLocations) > 1: 

1722 raise NotImplementedError(f"Can not export disassembled datasets such as {ref}") 

1723 location, storedFileInfo = fileLocations[0] 

1724 

1725 pathInStore = location.pathInStore.path 

1726 if transfer is None: 1726 ↛ 1729line 1726 didn't jump to line 1729, because the condition on line 1726 was never true

1727 # TODO: do we also need to return the readStorageClass somehow? 

1728 # We will use the path in store directly 

1729 pass 

1730 elif transfer == "direct": 1730 ↛ 1732line 1730 didn't jump to line 1732, because the condition on line 1730 was never true

1731 # Use full URIs to the remote store in the export 

1732 pathInStore = str(location.uri) 

1733 else: 

1734 # mypy needs help 

1735 assert directoryUri is not None, "directoryUri must be defined to get here" 

1736 storeUri = ButlerURI(location.uri) 

1737 

1738 # if the datastore has an absolute URI to a resource, we 

1739 # have two options: 

1740 # 1. Keep the absolute URI in the exported YAML 

1741 # 2. Allocate a new name in the local datastore and transfer 

1742 # it. 

1743 # For now go with option 2 

1744 if location.pathInStore.isabs(): 1744 ↛ 1745line 1744 didn't jump to line 1745, because the condition on line 1744 was never true

1745 template = self.templates.getTemplate(ref) 

1746 newURI = ButlerURI(template.format(ref), forceAbsolute=False) 

1747 pathInStore = str(newURI.updatedExtension(location.pathInStore.getExtension())) 

1748 

1749 exportUri = directoryUri.join(pathInStore) 

1750 exportUri.transfer_from(storeUri, transfer=transfer) 

1751 

1752 yield FileDataset(refs=[ref], path=pathInStore, formatter=storedFileInfo.formatter) 

1753 

1754 @staticmethod 

1755 def computeChecksum(uri: ButlerURI, algorithm: str = "blake2b", block_size: int = 8192) -> Optional[str]: 

1756 """Compute the checksum of the supplied file. 

1757 

1758 Parameters 

1759 ---------- 

1760 uri : `ButlerURI` 

1761 Name of resource to calculate checksum from. 

1762 algorithm : `str`, optional 

1763 Name of algorithm to use. Must be one of the algorithms supported 

1764 by :py:class`hashlib`. 

1765 block_size : `int` 

1766 Number of bytes to read from file at one time. 

1767 

1768 Returns 

1769 ------- 

1770 hexdigest : `str` 

1771 Hex digest of the file. 

1772 

1773 Notes 

1774 ----- 

1775 Currently returns None if the URI is for a remote resource. 

1776 """ 

1777 if algorithm not in hashlib.algorithms_guaranteed: 1777 ↛ 1778line 1777 didn't jump to line 1778, because the condition on line 1777 was never true

1778 raise NameError("The specified algorithm '{}' is not supported by hashlib".format(algorithm)) 

1779 

1780 if not uri.isLocal: 1780 ↛ 1781line 1780 didn't jump to line 1781, because the condition on line 1780 was never true

1781 return None 

1782 

1783 hasher = hashlib.new(algorithm) 

1784 

1785 with uri.as_local() as local_uri: 

1786 with open(local_uri.ospath, "rb") as f: 

1787 for chunk in iter(lambda: f.read(block_size), b""): 

1788 hasher.update(chunk) 

1789 

1790 return hasher.hexdigest() 

1791 

1792 def needs_expanded_data_ids( 

1793 self, 

1794 transfer: Optional[str], 

1795 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

1796 ) -> bool: 

1797 # Docstring inherited. 

1798 # This _could_ also use entity to inspect whether the filename template 

1799 # involves placeholders other than the required dimensions for its 

1800 # dataset type, but that's not necessary for correctness; it just 

1801 # enables more optimizations (perhaps only in theory). 

1802 return transfer not in ("direct", None)