
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileLikeDatastore", ) 

26 

27import logging 

28from abc import abstractmethod 

29 

30from sqlalchemy import Integer, String 

31 

32from dataclasses import dataclass 

33from typing import ( 

34 TYPE_CHECKING, 

35 Any, 

36 ClassVar, 

37 Dict, 

38 Iterable, 

39 List, 

40 Mapping, 

41 Optional, 

42 Set, 

43 Tuple, 

44 Type, 

45 Union, 

46) 

47 

48from lsst.daf.butler import ( 

49 CompositesMap, 

50 Config, 

51 FileDataset, 

52 DatasetRef, 

53 DatasetType, 

54 DatasetTypeNotSupportedError, 

55 Datastore, 

56 DatastoreConfig, 

57 DatastoreValidationError, 

58 FileDescriptor, 

59 FileTemplates, 

60 FileTemplateValidationError, 

61 Formatter, 

62 FormatterFactory, 

63 Location, 

64 LocationFactory, 

65 StorageClass, 

66 StoredFileInfo, 

67) 

68 

69from lsst.daf.butler import ddl 

70from lsst.daf.butler.registry.interfaces import ( 

71 ReadOnlyDatabaseError, 

72 DatastoreRegistryBridge, 

73 FakeDatasetRef, 

74) 

75 

76from lsst.daf.butler.core.repoRelocation import replaceRoot 

77from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional 

78from .genericDatastore import GenericBaseDatastore 

79 

80if TYPE_CHECKING:

81 from lsst.daf.butler import LookupKey 

82 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

83 

84log = logging.getLogger(__name__) 

85 

86# String to use when a Python None is encountered 

87NULLSTR = "__NULL_STRING__" 

88 

89 

90class _IngestPrepData(Datastore.IngestPrepData): 

91 """Helper class for FileLikeDatastore ingest implementation. 

92 

93 Parameters 

94 ---------- 

95 datasets : `list` of `FileDataset` 

96 Files to be ingested by this datastore. 

97 """ 

98 def __init__(self, datasets: List[FileDataset]): 

99 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

100 self.datasets = datasets 

101 

102 

103@dataclass(frozen=True) 

104class DatastoreFileGetInformation: 

105 """Collection of useful parameters needed to retrieve a file from 

106 a Datastore. 

107 """ 

108 

109 location: Location 

110 """The location from which to read the dataset.""" 

111 

112 formatter: Formatter 

113 """The `Formatter` to use to deserialize the dataset.""" 

114 

115 info: StoredFileInfo 

116 """Stored information about this file and its formatter.""" 

117 

118 assemblerParams: dict 

119 """Parameters to use for post-processing the retrieved dataset.""" 

120 

121 component: Optional[str] 

122 """The component to be retrieved (can be `None`).""" 

123 

124 readStorageClass: StorageClass 

125 """The `StorageClass` of the dataset being read.""" 

126 

127 

128class FileLikeDatastore(GenericBaseDatastore): 

129 """Generic Datastore for file-based implementations. 

130 

131 Should always be sub-classed since key abstract methods are missing. 

132 

133 Parameters 

134 ---------- 

135 config : `DatastoreConfig` or `str` 

136 Configuration as either a `Config` object or URI to file. 

137 bridgeManager : `DatastoreRegistryBridgeManager` 

138 Object that manages the interface between `Registry` and datastores. 

139 butlerRoot : `str`, optional 

140 New datastore root to use to override the configuration value. 

141 

142 Raises 

143 ------ 

144 ValueError 

145 If root location does not exist and ``create`` is `False` in the 

146 configuration. 

147 """ 

148 

149 defaultConfigFile: ClassVar[Optional[str]] = None 

150 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

151 absolute path. Can be None if no defaults specified. 

152 """ 

153 

154 root: str 

155 """Root directory or URI of this `Datastore`.""" 

156 

157 locationFactory: LocationFactory 

158 """Factory for creating locations relative to the datastore root.""" 

159 

160 formatterFactory: FormatterFactory 

161 """Factory for creating instances of formatters.""" 

162 

163 templates: FileTemplates 

164 """File templates that can be used by this `Datastore`.""" 

165 

166 composites: CompositesMap 

167 """Determines whether a dataset should be disassembled on put.""" 

168 

169 @classmethod 

170 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

171 """Set any filesystem-dependent config options for this Datastore to 

172 be appropriate for a new empty repository with the given root. 

173 

174 Parameters 

175 ---------- 

176 root : `str` 

177 URI to the root of the data repository. 

178 config : `Config` 

179 A `Config` to update. Only the subset understood by 

180 this component will be updated. Will not expand 

181 defaults. 

182 full : `Config` 

183 A complete config with all defaults expanded that can be 

184 converted to a `DatastoreConfig`. Read-only and will not be 

185 modified by this method. 

186 Repository-specific options that should not be obtained 

187 from defaults when Butler instances are constructed 

188 should be copied from ``full`` to ``config``. 

189 overwrite : `bool`, optional 

190 If `False`, do not modify a value in ``config`` if the value 

191 already exists. Default is always to overwrite with the provided 

192 ``root``. 

193 

194 Notes 

195 ----- 

196 If a keyword is explicitly defined in the supplied ``config`` it 

197 will not be overridden by this method if ``overwrite`` is `False`. 

198 This allows explicit values set in external configs to be retained. 

199 """ 

200 Config.updateParameters(DatastoreConfig, config, full, 

201 toUpdate={"root": root}, 

202 toCopy=("cls", ("records", "table")), overwrite=overwrite) 
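# Illustrative effect of the call above (values hypothetical): for a new
# repository rooted at "/repo", config["root"] becomes "/repo", while the
# "cls" and "records.table" entries are copied verbatim from ``full`` so they
# are frozen into the repository config rather than re-derived from defaults;
# pre-existing values in ``config`` survive when ``overwrite`` is False.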

203 

204 @classmethod 

205 def makeTableSpec(cls) -> ddl.TableSpec: 

206 return ddl.TableSpec( 

207 fields=NamedValueSet([ 

208 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True), 

209 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

210 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

211 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

212 # Use the NULLSTR sentinel to indicate no component

213 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

214 # TODO: should checksum be Base64Bytes instead? 

215 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

216 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True), 

217 ]), 

218 unique=frozenset(), # type: ignore 

219 ) 
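# Illustrative row in this opaque table (all values hypothetical), matching
# what addStoredItemInfo() inserts below:
#   {"dataset_id": 42, "path": "run/myDatasetType/file.fits",
#    "formatter": "mypackage.MyFormatter", "storage_class": "StructuredData",
#    "component": NULLSTR, "checksum": None, "file_size": 1024}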

220 

221 def __init__(self, config: Union[DatastoreConfig, str], 

222 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: Optional[str] = None):

223 super().__init__(config, bridgeManager) 

224 if "root" not in self.config:

225 raise ValueError("No root directory specified in configuration") 

226 

227 # Name ourselves either using an explicit name or a name 

228 # derived from the (unexpanded) root 

229 if "name" in self.config: 

230 self.name = self.config["name"] 

231 else: 

232 # We use the unexpanded root in the name to indicate that this 

233 # datastore can be moved without having to update registry. 

234 self.name = "{}@{}".format(type(self).__name__, 

235 self.config["root"]) 
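# For example, a hypothetical subclass named "PosixDatastore" configured with
# root "/repo" and no explicit "name" entry would be called
# "PosixDatastore@/repo"; using the unexpanded root keeps that name stable if
# the repository is later relocated.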

236 

237 # Support repository relocation in config 

238 # Existence of self.root is checked in subclass 

239 self.root = replaceRoot(self.config["root"], butlerRoot) 

240 

241 self.locationFactory = LocationFactory(self.root) 

242 self.formatterFactory = FormatterFactory() 

243 

244 # Now associate formatters with storage classes 

245 self.formatterFactory.registerFormatters(self.config["formatters"], 

246 universe=bridgeManager.universe) 

247 

248 # Read the file naming templates 

249 self.templates = FileTemplates(self.config["templates"], 

250 universe=bridgeManager.universe) 

251 

252 # See if composites should be disassembled 

253 self.composites = CompositesMap(self.config["composites"], 

254 universe=bridgeManager.universe) 

255 

256 tableName = self.config["records", "table"] 

257 try: 

258 # Storage of paths and formatters, keyed by dataset_id 

259 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

260 # Interface to Registry. 

261 self._bridge = bridgeManager.register(self.name) 

262 except ReadOnlyDatabaseError: 

263 # If the database is read only and we just tried and failed to 

264 # create a table, it means someone is trying to create a read-only 

265 # butler client for an empty repo. That should be okay, as long 

266 # as they then try to get any datasets before some other client 

267 # creates the table. Chances are they're just validating

268 # configuration. 

269 pass 

270 

271 # Determine whether checksums should be used 

272 self.useChecksum = self.config.get("checksum", True) 

273 

274 def __str__(self) -> str: 

275 return self.root 

276 

277 @property 

278 def bridge(self) -> DatastoreRegistryBridge: 

279 return self._bridge 

280 

281 @abstractmethod 

282 def _artifact_exists(self, location: Location) -> bool: 

283 """Check that an artifact exists in this datastore at the specified 

284 location. 

285 

286 Parameters 

287 ---------- 

288 location : `Location` 

289 Expected location of the artifact associated with this datastore. 

290 

291 Returns 

292 ------- 

293 exists : `bool` 

294 `True` if the location can be found, `False` otherwise.

295 """ 

296 raise NotImplementedError() 

297 

298 @abstractmethod 

299 def _delete_artifact(self, location: Location) -> None: 

300 """Delete the artifact from the datastore. 

301 

302 Parameters 

303 ---------- 

304 location : `Location` 

305 Location of the artifact associated with this datastore. 

306 """ 

307 raise NotImplementedError() 

308 

309 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

310 # Docstring inherited from GenericBaseDatastore 

311 records = [] 

312 for ref, info in zip(refs, infos): 

313 # Component should come from ref and fall back on info 

314 component = ref.datasetType.component() 

315 if component is None and info.component is not None:

316 component = info.component 

317 if component is None: 

318 # Use the NULLSTR sentinel since we want this to be part of the

319 # primary key. 

320 component = NULLSTR 

321 records.append( 

322 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

323 storage_class=info.storageClass.name, component=component, 

324 checksum=info.checksum, file_size=info.file_size) 

325 ) 

326 self._table.insert(*records) 

327 

328 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredFileInfo: 

329 # Docstring inherited from GenericBaseDatastore 

330 

331 if ref.id is None:

332 raise RuntimeError("Unable to retrieve information for unresolved DatasetRef") 

333 

334 where: Dict[str, Union[int, str]] = {"dataset_id": ref.id} 

335 

336 # If we have no component we want the row from this table without 

337 # a component. If we do have a component we either need the row 

338 # with no component or the row with the component, depending on how 

339 # this dataset was disassembled.

340 

341 # if we are emptying trash we won't have real refs so can't constrain 

342 # by component. Will need to fix this to return multiple matches 

343 # in future. 

344 component = None 

345 try: 

346 component = ref.datasetType.component() 

347 except AttributeError: 

348 pass 

349 else: 

350 if component is None: 

351 where["component"] = NULLSTR 

352 

353 # Look for the dataset_id -- there might be multiple matches 

354 # if we have disassembled the dataset. 

355 records = list(self._table.fetch(**where)) 

356 if len(records) == 0:

357 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.") 

358 

359 # if we are not asking for a component 

360 if not component and len(records) != 1:

361 raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}") 

362 

363 # if we had a FakeDatasetRef we pick the first record regardless 

364 if isinstance(ref, FakeDatasetRef):

365 record = records[0] 

366 else: 

367 records_by_component = {} 

368 for r in records: 

369 this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None 

370 records_by_component[this_component] = r 

371 

372 # Look for component by name else fall back to the parent 

373 for lookup in (component, None):

374 if lookup in records_by_component:

375 record = records_by_component[lookup] 

376 break 

377 else: 

378 raise KeyError(f"Unable to retrieve location for component {component} associated with " 

379 f"dataset {ref}.") 

380 

381 # Convert name of StorageClass to instance 

382 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

383 

384 return StoredFileInfo(formatter=record["formatter"], 

385 path=record["path"], 

386 storageClass=storageClass, 

387 component=component, 

388 checksum=record["checksum"], 

389 file_size=record["file_size"]) 
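# Illustrative lookup order (hypothetical dataset type "calexp.wcs"): the
# component is "wcs", so the loop above tries records_by_component["wcs"]
# first and then falls back to the parent record stored under None when the
# dataset was written without disassembly.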

390 

391 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

392 # Docstring inherited from GenericBaseDatastore 

393 

394 # Look for the dataset_id -- there might be multiple matches 

395 # if we have disassembled the dataset. 

396 records = list(self._table.fetch(dataset_id=ref.id)) 

397 

398 results = [] 

399 for record in records: 

400 # Convert name of StorageClass to instance 

401 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

402 component = record["component"] if (record["component"] 

403 and record["component"] != NULLSTR) else None 

404 

405 info = StoredFileInfo(formatter=record["formatter"], 

406 path=record["path"], 

407 storageClass=storageClass, 

408 component=component, 

409 checksum=record["checksum"], 

410 file_size=record["file_size"]) 

411 results.append(info) 

412 

413 return results 

414 

415 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

416 """Return all dataset refs associated with the supplied path. 

417 

418 Parameters 

419 ---------- 

420 pathInStore : `str` 

421 Path of interest in the data store. 

422 

423 Returns 

424 ------- 

425 ids : `set` of `int` 

426 All `DatasetRef` IDs associated with this path. 

427 """ 

428 records = list(self._table.fetch(path=pathInStore)) 

429 ids = {r["dataset_id"] for r in records} 

430 return ids 

431 

432 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

433 # Docstring inherited from GenericBaseDatastore 

434 self._table.delete(dataset_id=ref.id) 

435 

436 def _get_dataset_location_info(self, 

437 ref: DatasetRef) -> Tuple[Optional[Location], Optional[StoredFileInfo]]: 

438 """Find the `Location` of the requested dataset in the 

439 `Datastore` and the associated stored file information. 

440 

441 Parameters 

442 ---------- 

443 ref : `DatasetRef` 

444 Reference to the required `Dataset`. 

445 

446 Returns 

447 ------- 

448 location : `Location` 

449 Location of the dataset within the datastore. 

450 Returns `None` if the dataset can not be located. 

451 info : `StoredFileInfo` 

452 Stored information about this file and its formatter. 

453 """ 

454 # Get the file information (this will fail if no file) 

455 try: 

456 storedFileInfo = self.getStoredItemInfo(ref) 

457 except KeyError: 

458 return None, None 

459 

460 # Use the path to determine the location 

461 location = self.locationFactory.fromPath(storedFileInfo.path) 

462 

463 return location, storedFileInfo 

464 

465 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

466 r"""Find all the `Location`\ s of the requested dataset in the 

467 `Datastore` and the associated stored file information. 

468 

469 Parameters 

470 ---------- 

471 ref : `DatasetRef` 

472 Reference to the required `Dataset`. 

473 

474 Returns 

475 ------- 

476 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

477 Location of the dataset within the datastore and 

478 stored information about each file and its formatter. 

479 """ 

480 # Get the file information (this will fail if no file) 

481 records = self.getStoredItemsInfo(ref) 

482 

483 # Use the path to determine the location 

484 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

485 

486 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

487 """Check that there is only one dataset associated with the 

488 specified artifact. 

489 

490 Parameters 

491 ---------- 

492 ref : `DatasetRef` or `FakeDatasetRef` 

493 Dataset to be removed. 

494 location : `Location` 

495 The location of the artifact to be removed. 

496 

497 Returns 

498 ------- 

499 can_remove : `bool`

500 True if the artifact can be safely removed. 

501 """ 

502 

503 # Get all entries associated with this path 

504 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

505 if not allRefs:

506 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

507 

508 # Get all the refs associated with this dataset if it is a composite 

509 theseRefs = {r.id for r in ref.allRefs()} 

510 

511 # Remove these refs from all the refs and if there is nothing left 

512 # then we can delete 

513 remainingRefs = allRefs - theseRefs 

514 

515 if remainingRefs: 

516 return False 

517 return True 
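# Sketch with hypothetical IDs: if the artifact at this path is registered
# for dataset_ids {1, 2} and the ref being trashed only covers {1}, then
# remainingRefs == {2} and the file must be kept; deletion is allowed only
# when every dataset_id registered for the path is accounted for by this ref.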

518 

519 def _prepare_for_get(self, ref: DatasetRef, 

520 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

521 """Check parameters for ``get`` and obtain formatter and 

522 location. 

523 

524 Parameters 

525 ---------- 

526 ref : `DatasetRef` 

527 Reference to the required Dataset. 

528 parameters : `dict` 

529 `StorageClass`-specific parameters that specify, for example, 

530 a slice of the dataset to be loaded. 

531 

532 Returns 

533 ------- 

534 getInfo : `list` [`DatastoreFileGetInformation`] 

535 Parameters needed to retrieve each file. 

536 """ 

537 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

538 

539 # Get file metadata and internal metadata 

540 fileLocations = self._get_dataset_locations_info(ref) 

541 if not fileLocations: 

542 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

543 

544 # The storage class we want to use eventually 

545 refStorageClass = ref.datasetType.storageClass 

546 

547 # Check that the supplied parameters are suitable for the type read 

548 refStorageClass.validateParameters(parameters) 

549 

550 if len(fileLocations) > 1: 

551 disassembled = True 

552 else: 

553 disassembled = False 

554 

555 # Is this a component request? 

556 refComponent = ref.datasetType.component() 

557 

558 fileGetInfo = [] 

559 for location, storedFileInfo in fileLocations: 

560 

561 # The storage class used to write the file 

562 writeStorageClass = storedFileInfo.storageClass 

563 

564 # If this has been disassembled we need read to match the write 

565 if disassembled: 

566 readStorageClass = writeStorageClass 

567 else: 

568 readStorageClass = refStorageClass 

569 

570 formatter = getInstanceOf(storedFileInfo.formatter, 

571 FileDescriptor(location, readStorageClass=readStorageClass, 

572 storageClass=writeStorageClass, parameters=parameters), 

573 ref.dataId) 

574 

575 _, notFormatterParams = formatter.segregateParameters() 

576 

577 # Of the remaining parameters, extract the ones supported by 

578 # this StorageClass (for components not all will be handled) 

579 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

580 

581 # The ref itself could be a component if the dataset was 

582 # disassembled by butler, or we disassembled in datastore and 

583 # components came from the datastore records 

584 component = storedFileInfo.component if storedFileInfo.component else refComponent 

585 

586 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

587 assemblerParams, component, readStorageClass)) 

588 

589 return fileGetInfo 

590 

591 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

592 """Check the arguments for ``put`` and obtain formatter and 

593 location. 

594 

595 Parameters 

596 ---------- 

597 inMemoryDataset : `object` 

598 The dataset to store. 

599 ref : `DatasetRef` 

600 Reference to the associated Dataset. 

601 

602 Returns 

603 ------- 

604 location : `Location` 

605 The location to write the dataset. 

606 formatter : `Formatter` 

607 The `Formatter` to use to write the dataset. 

608 

609 Raises 

610 ------ 

611 TypeError 

612 Supplied object and storage class are inconsistent. 

613 DatasetTypeNotSupportedError 

614 The associated `DatasetType` is not handled by this datastore. 

615 """ 

616 self._validate_put_parameters(inMemoryDataset, ref) 

617 

618 # Work out output file name 

619 try: 

620 template = self.templates.getTemplate(ref) 

621 except KeyError as e: 

622 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

623 

624 location = self.locationFactory.fromPath(template.format(ref)) 

625 

626 # Get the formatter based on the storage class 

627 storageClass = ref.datasetType.storageClass 

628 try: 

629 formatter = self.formatterFactory.getFormatter(ref, 

630 FileDescriptor(location, 

631 storageClass=storageClass), 

632 ref.dataId) 

633 except KeyError as e: 

634 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e 

635 

636 return location, formatter 
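# Sketch (hypothetical template): a file template such as
# "{run}/{datasetType}/{datasetType}_{visit}" is expanded with
# template.format(ref) to give the path under the datastore root, and the
# formatter registered for the ref's storage class determines how the dataset
# is serialized at that location.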

637 

638 @abstractmethod 

639 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

640 """Standardize the path of a to-be-ingested file. 

641 

642 Parameters 

643 ---------- 

644 path : `str` 

645 Path of a file to be ingested. 

646 transfer : `str`, optional 

647 How (and whether) the dataset should be added to the datastore. 

648 See `ingest` for details of transfer modes. 

649 This implementation is provided only so 

650 `NotImplementedError` can be raised if the mode is not supported; 

651 actual transfers are deferred to `_extractIngestInfo`. 

652 

653 Returns 

654 ------- 

655 path : `str` 

656 New path in what the datastore considers standard form. 

657 

658 Notes 

659 ----- 

660 Subclasses of `FileLikeDatastore` should implement this method instead 

661 of `_prepIngest`. It should not modify the data repository or given 

662 file in any way. 

663 

664 Raises 

665 ------ 

666 NotImplementedError 

667 Raised if the datastore does not support the given transfer mode 

668 (including the case where ingest is not supported at all). 

669 FileNotFoundError 

670 Raised if one of the given files does not exist. 

671 """ 

672 raise NotImplementedError("Must be implemented by subclasses.") 

673 

674 @abstractmethod 

675 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, 

676 formatter: Union[Formatter, Type[Formatter]], 

677 transfer: Optional[str] = None) -> StoredFileInfo: 

678 """Relocate (if necessary) and extract `StoredFileInfo` from a 

679 to-be-ingested file. 

680 

681 Parameters 

682 ---------- 

683 path : `str` 

684 Path of a file to be ingested. 

685 ref : `DatasetRef` 

686 Reference for the dataset being ingested. Guaranteed to have 

687 ``dataset_id is not None``.

688 formatter : `type` or `Formatter` 

689 `Formatter` subclass to use for this dataset or an instance. 

690 transfer : `str`, optional 

691 How (and whether) the dataset should be added to the datastore. 

692 See `ingest` for details of transfer modes. 

693 

694 Returns 

695 ------- 

696 info : `StoredFileInfo` 

697 Internal datastore record for this file. This will be inserted by 

698 the caller; `_extractIngestInfo` is only responsible for

699 creating and populating the struct. 

700 

701 Raises 

702 ------ 

703 FileNotFoundError 

704 Raised if one of the given files does not exist. 

705 FileExistsError 

706 Raised if transfer is not `None` but the (internal) location the 

707 file would be moved to is already occupied. 

708 """ 

709 raise NotImplementedError("Must be implemented by subclasses.") 

710 

711 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

712 # Docstring inherited from Datastore._prepIngest. 

713 filtered = [] 

714 for dataset in datasets: 

715 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

716 if not acceptable: 

717 continue 

718 else: 

719 dataset.refs = acceptable 

720 if dataset.formatter is None: 

721 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

722 else: 

723 dataset.formatter = getClassOf(dataset.formatter) 

724 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

725 filtered.append(dataset) 

726 return _IngestPrepData(filtered) 
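# Summary sketch: datasets whose refs are all rejected by self.constraints
# are dropped here; for the rest, a missing formatter is resolved from the
# first accepted ref, an explicit one is converted to a class via
# getClassOf(), and the path is standardized now while any actual file
# transfer is deferred to _finishIngest()/_extractIngestInfo().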

727 

728 @transactional 

729 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

730 # Docstring inherited from Datastore._finishIngest. 

731 refsAndInfos = [] 

732 for dataset in prepData.datasets: 

733 # Do ingest as if the first dataset ref is associated with the file 

734 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

735 transfer=transfer) 

736 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

737 self._register_datasets(refsAndInfos) 

738 

739 @abstractmethod 

740 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

741 """Write an in-memory dataset out to the datastore.

742 

743 Parameters 

744 ---------- 

745 inMemoryDataset : `object` 

746 Dataset to write to datastore. 

747 ref : `DatasetRef` 

748 Registry information associated with this dataset. 

749 

750 Returns 

751 ------- 

752 info : `StoredFileInfo` 

753 Information describing the artifact written to the datastore.

754 """ 

755 raise NotImplementedError() 

756 

757 @abstractmethod 

758 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

759 ref: DatasetRef, isComponent: bool = False) -> Any: 

760 """Read an artifact from the datastore into an in-memory object.

761 

762 Parameters 

763 ---------- 

764 getInfo : `DatastoreFileGetInformation` 

765 Information about the artifact within the datastore. 

766 ref : `DatasetRef` 

767 The registry information associated with this artifact. 

768 isComponent : `bool` 

769 Flag to indicate if a component is being read from this artifact. 

770 

771 Returns 

772 ------- 

773 inMemoryDataset : `object` 

774 The artifact as a python object. 

775 """ 

776 raise NotImplementedError() 

777 

778 def exists(self, ref: DatasetRef) -> bool: 

779 """Check if the dataset exists in the datastore. 

780 

781 Parameters 

782 ---------- 

783 ref : `DatasetRef` 

784 Reference to the required dataset. 

785 

786 Returns 

787 ------- 

788 exists : `bool` 

789 `True` if the entity exists in the `Datastore`. 

790 """ 

791 fileLocations = self._get_dataset_locations_info(ref) 

792 if not fileLocations: 

793 return False 

794 for location, _ in fileLocations: 

795 if not self._artifact_exists(location): 

796 return False 

797 

798 return True 

799 

800 def getUri(self, ref: DatasetRef, predict: bool = False) -> str: 

801 """URI to the Dataset. 

802 

803 Parameters 

804 ---------- 

805 ref : `DatasetRef` 

806 Reference to the required Dataset. 

807 predict : `bool` 

808 If `True`, allow URIs to be returned of datasets that have not 

809 been written. 

810 

811 Returns 

812 ------- 

813 uri : `str` 

814 URI string pointing to the dataset within the datastore. If the 

815 dataset does not exist in the datastore, and if ``predict`` is 

816 `True`, the URI will be a prediction and will include a URI 

817 fragment "#predicted". 

818 If the datastore does not have entities that relate well 

819 to the concept of a URI the returned URI string will be 

820 descriptive. The returned URI is not guaranteed to be obtainable. 

821 

822 Raises 

823 ------ 

824 FileNotFoundError 

825 A URI has been requested for a dataset that does not exist and 

826 guessing is not allowed. 

827 

828 Notes 

829 ----- 

830 When a predicted URI is requested an attempt will be made to form 

831 a reasonable URI based on file templates and the expected formatter. 

832 """ 

833 # if this has never been written then we have to guess 

834 if not self.exists(ref): 

835 if not predict: 

836 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

837 

838 template = self.templates.getTemplate(ref) 

839 location = self.locationFactory.fromPath(template.format(ref)) 

840 storageClass = ref.datasetType.storageClass 

841 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location, 

842 storageClass=storageClass)) 

843 # Try to use the extension attribute but ignore problems if the 

844 # formatter does not define one. 

845 try: 

846 location = formatter.makeUpdatedLocation(location) 

847 except Exception: 

848 # Use the default extension 

849 pass 

850 

851 # Add a URI fragment to indicate this is a guess 

852 return location.uri + "#predicted" 

853 

854 # If this is a ref that we have written we can get the path. 

855 # Get file metadata and internal metadata 

856 storedFileInfo = self.getStoredItemInfo(ref) 

857 

858 # Use the path to determine the location 

859 location = self.locationFactory.fromPath(storedFileInfo.path) 

860 

861 return location.uri 
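# Example of the two return forms (paths hypothetical): an unwritten dataset
# requested with predict=True might yield
#   "file:///repo/run/myDatasetType/myDatasetType_v1.fits#predicted"
# while a stored dataset returns the URI of its recorded location directly.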

862 

863 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

864 """Load an InMemoryDataset from the store. 

865 

866 Parameters 

867 ---------- 

868 ref : `DatasetRef` 

869 Reference to the required Dataset. 

870 parameters : `dict` 

871 `StorageClass`-specific parameters that specify, for example, 

872 a slice of the dataset to be loaded. 

873 

874 Returns 

875 ------- 

876 inMemoryDataset : `object` 

877 Requested dataset or slice thereof as an InMemoryDataset. 

878 

879 Raises 

880 ------ 

881 FileNotFoundError 

882 Requested dataset can not be retrieved. 

883 TypeError 

884 Return value from formatter has unexpected type. 

885 ValueError 

886 Formatter failed to process the dataset. 

887 """ 

888 allGetInfo = self._prepare_for_get(ref, parameters) 

889 refComponent = ref.datasetType.component() 

890 

891 if len(allGetInfo) > 1 and not refComponent: 

892 # This was a disassembled dataset spread over multiple files 

893 # and we need to put them all back together again. 

894 # Read into memory and then assemble 

895 usedParams = set() 

896 components = {} 

897 for getInfo in allGetInfo: 

898 # assemblerParams are parameters not understood by the 

899 # associated formatter. 

900 usedParams.update(set(getInfo.assemblerParams)) 

901 

902 component = getInfo.component 

903 # We do not want the formatter to think it's reading 

904 # a component though because it is really reading a 

905 # standalone dataset -- always tell reader it is not a 

906 # component. 

907 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

908 

909 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

910 

911 # Any unused parameters will have to be passed to the assembler 

912 if parameters: 

913 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

914 else: 

915 unusedParams = {} 

916 

917 # Process parameters 

918 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

919 parameters=unusedParams) 

920 

921 else: 

922 # Single file request or component from that composite file 

923 allComponents = {i.component: i for i in allGetInfo} 

924 for lookup in (refComponent, None):

925 if lookup in allComponents:

926 getInfo = allComponents[lookup] 

927 break 

928 else: 

929 raise FileNotFoundError(f"Component {refComponent} not found " 

930 f"for ref {ref} in datastore {self.name}") 

931 

932 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None) 
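# Illustrative flow for a disassembled composite (hypothetical components):
# a get() of the parent dataset reads each stored piece separately, then
# rebuilds it with something like
#   storageClass.assembler().assemble({"image": ..., "wcs": ...})
# before handleParameters() applies any parameters the formatters did not
# already consume.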

933 

934 @transactional 

935 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

936 """Write an InMemoryDataset with a given `DatasetRef` to the store.

937 

938 Parameters 

939 ---------- 

940 inMemoryDataset : `object` 

941 The dataset to store. 

942 ref : `DatasetRef` 

943 Reference to the associated Dataset. 

944 

945 Raises 

946 ------ 

947 TypeError 

948 Supplied object and storage class are inconsistent. 

949 DatasetTypeNotSupportedError 

950 The associated `DatasetType` is not handled by this datastore. 

951 

952 Notes 

953 ----- 

954 If the datastore is configured to reject certain dataset types it 

955 is possible that the put will fail and raise a 

956 `DatasetTypeNotSupportedError`. The main use case for this is to 

957 allow `ChainedDatastore` to put to multiple datastores without 

958 requiring that every datastore accepts the dataset. 

959 """ 

960 

961 doDisassembly = self.composites.shouldBeDisassembled(ref) 

962 # doDisassembly = True 

963 

964 artifacts = [] 

965 if doDisassembly: 

966 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset) 

967 for component, componentInfo in components.items(): 

968 compTypeName = ref.datasetType.componentTypeName(component) 

969 # Don't recurse because we want to take advantage of 

970 # bulk insert -- need a new DatasetRef that refers to the 

971 # same dataset_id but has the component DatasetType.

972 # DatasetType does not refer to the types of its components,

973 # so we construct one ourselves.

974 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions, 

975 storageClass=componentInfo.storageClass) 

976 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False) 

977 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

978 artifacts.append((compRef, storedInfo)) 

979 else: 

980 # Write the entire thing out 

981 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

982 artifacts.append((ref, storedInfo)) 

983 

984 self._register_datasets(artifacts) 
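# Disassembly sketch (hypothetical components): putting a composite whose
# storage class splits into {"image", "mask"} writes one artifact per
# component, each registered under a DatasetRef that keeps the parent's
# dataset_id but carries the component DatasetType, so get() can later
# reassemble the composite from these records.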

985 

986 @transactional 

987 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

988 """Indicate to the datastore that a dataset can be removed. 

989 

990 Parameters 

991 ---------- 

992 ref : `DatasetRef` 

993 Reference to the required Dataset. 

994 ignore_errors : `bool` 

995 If `True` return without error even if something went wrong. 

996 Problems could occur if another process is simultaneously trying 

997 to delete. 

998 

999 Raises 

1000 ------ 

1001 FileNotFoundError 

1002 Attempt to remove a dataset that does not exist. 

1003 """ 

1004 # Get file metadata and internal metadata 

1005 log.debug("Trashing %s in datastore %s", ref, self.name) 

1006 

1007 fileLocations = self._get_dataset_locations_info(ref) 

1008 

1009 if not fileLocations: 

1010 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1011 if ignore_errors: 

1012 log.warning(err_msg) 

1013 return 

1014 else: 

1015 raise FileNotFoundError(err_msg) 

1016 

1017 for location, storedFileInfo in fileLocations: 

1018 if not self._artifact_exists(location):

1019 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1020 f"associated artifact ({location.uri}) is missing" 

1021 if ignore_errors: 

1022 log.warning(err_msg) 

1023 return 

1024 else: 

1025 raise FileNotFoundError(err_msg) 

1026 

1027 # Mark dataset as trashed 

1028 try: 

1029 self._move_to_trash_in_registry(ref) 

1030 except Exception as e: 

1031 if ignore_errors: 

1032 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1033 f"but encountered an error: {e}") 

1034 pass 

1035 else: 

1036 raise 

1037 

1038 @transactional 

1039 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1040 """Remove all datasets from the trash. 

1041 

1042 Parameters 

1043 ---------- 

1044 ignore_errors : `bool` 

1045 If `True` return without error even if something went wrong. 

1046 Problems could occur if another process is simultaneously trying 

1047 to delete. 

1048 """ 

1049 log.debug("Emptying trash in datastore %s", self.name) 

1050 # Context manager will empty trash iff we finish it without raising. 

1051 with self._bridge.emptyTrash() as trashed: 

1052 for ref in trashed: 

1053 fileLocations = self._get_dataset_locations_info(ref) 

1054 

1055 if not fileLocations:

1056 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1057 if ignore_errors: 

1058 log.warning(err_msg) 

1059 continue 

1060 else: 

1061 raise FileNotFoundError(err_msg) 

1062 

1063 for location, _ in fileLocations: 

1064 

1065 if not self._artifact_exists(location):

1066 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1067 if ignore_errors: 

1068 log.warning(err_msg) 

1069 continue 

1070 else: 

1071 raise FileNotFoundError(err_msg) 

1072 

1073 # Can only delete the artifact if there are no references 

1074 # to the file from untrashed dataset refs. 

1075 if self._can_remove_dataset_artifact(ref, location): 

1076 # Point of no return for this artifact 

1077 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1078 try: 

1079 self._delete_artifact(location) 

1080 except Exception as e: 

1081 if ignore_errors: 

1082 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1083 location.uri, self.name, e) 

1084 else: 

1085 raise 

1086 

1087 # Now must remove the entry from the internal registry even if 

1088 # the artifact removal failed and was ignored, 

1089 # otherwise the removal check above will never be true 

1090 try: 

1091 # There may be multiple rows associated with this ref 

1092 # depending on disassembly 

1093 self.removeStoredItemInfo(ref) 

1094 except Exception as e: 

1095 if ignore_errors: 

1096 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1097 ref.id, location.uri, self.name, e) 

1098 continue 

1099 else: 

1100 raise

1101 

1102 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1103 logFailures: bool = False) -> None: 

1104 """Validate some of the configuration for this datastore. 

1105 

1106 Parameters 

1107 ---------- 

1108 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1109 Entities to test against this configuration. Can be differing 

1110 types. 

1111 logFailures : `bool`, optional 

1112 If `True`, output a log message for every validation error 

1113 detected. 

1114 

1115 Raises 

1116 ------ 

1117 DatastoreValidationError 

1118 Raised if there is a validation problem with a configuration. 

1119 All the problems are reported in a single exception. 

1120 

1121 Notes 

1122 ----- 

1123 This method checks that all the supplied entities have valid file 

1124 templates and also have formatters defined. 

1125 """ 

1126 

1127 templateFailed = None 

1128 try: 

1129 self.templates.validateTemplates(entities, logFailures=logFailures) 

1130 except FileTemplateValidationError as e: 

1131 templateFailed = str(e) 

1132 

1133 formatterFailed = [] 

1134 for entity in entities: 

1135 try: 

1136 self.formatterFactory.getFormatterClass(entity) 

1137 except KeyError as e: 

1138 formatterFailed.append(str(e)) 

1139 if logFailures:

1140 log.fatal("Formatter failure: %s", e) 

1141 

1142 if templateFailed or formatterFailed: 

1143 messages = [] 

1144 if templateFailed:

1145 messages.append(templateFailed) 

1146 if formatterFailed:

1147 messages.append(",".join(formatterFailed)) 

1148 msg = ";\n".join(messages) 

1149 raise DatastoreValidationError(msg) 

1150 

1151 def getLookupKeys(self) -> Set[LookupKey]: 

1152 # Docstring is inherited from base class 

1153 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1154 self.constraints.getLookupKeys() 

1155 

1156 def validateKey(self, lookupKey: LookupKey, 

1157 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1158 # Docstring is inherited from base class 

1159 # The key can be valid in either formatters or templates so we can 

1160 # only check the template if it exists 

1161 if lookupKey in self.templates: 

1162 try: 

1163 self.templates[lookupKey].validateTemplate(entity) 

1164 except FileTemplateValidationError as e: 

1165 raise DatastoreValidationError(e) from e