Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21from __future__ import annotations 

22 

23"""Generic file-based datastore code.""" 

24 

25__all__ = ("FileLikeDatastore", ) 

26 

27import logging 

28from abc import abstractmethod 

29 

30from sqlalchemy import BigInteger, String 

31 

32from dataclasses import dataclass 

33from typing import ( 

34 TYPE_CHECKING, 

35 Any, 

36 ClassVar, 

37 Dict, 

38 Iterable, 

39 List, 

40 Mapping, 

41 Optional, 

42 Set, 

43 Tuple, 

44 Type, 

45 Union, 

46) 

47 

48from lsst.daf.butler import ( 

49 ButlerURI, 

50 CompositesMap, 

51 Config, 

52 FileDataset, 

53 DatasetRef, 

54 DatasetType, 

55 DatasetTypeNotSupportedError, 

56 Datastore, 

57 DatastoreConfig, 

58 DatastoreValidationError, 

59 FileDescriptor, 

60 FileTemplates, 

61 FileTemplateValidationError, 

62 Formatter, 

63 FormatterFactory, 

64 Location, 

65 LocationFactory, 

66 StorageClass, 

67 StoredFileInfo, 

68) 

69 

70from lsst.daf.butler import ddl 

71from lsst.daf.butler.registry.interfaces import ( 

72 ReadOnlyDatabaseError, 

73 DatastoreRegistryBridge, 

74 FakeDatasetRef, 

75) 

76 

77from lsst.daf.butler.core.repoRelocation import replaceRoot 

78from lsst.daf.butler.core.utils import getInstanceOf, getClassOf, transactional 

79from .genericDatastore import GenericBaseDatastore 

80 

81if TYPE_CHECKING: 81 ↛ 82line 81 didn't jump to line 82, because the condition on line 81 was never true

82 from lsst.daf.butler import LookupKey 

83 from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

84 

85log = logging.getLogger(__name__) 

86 

87# String to use when a Python None is encountered 

88NULLSTR = "__NULL_STRING__" 

89 

90 

91class _IngestPrepData(Datastore.IngestPrepData): 

92 """Helper class for FileLikeDatastore ingest implementation. 

93 

94 Parameters 

95 ---------- 

96 datasets : `list` of `FileDataset` 

97 Files to be ingested by this datastore. 

98 """ 

99 def __init__(self, datasets: List[FileDataset]): 

100 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

101 self.datasets = datasets 

102 

103 

104@dataclass(frozen=True) 

105class DatastoreFileGetInformation: 

106 """Collection of useful parameters needed to retrieve a file from 

107 a Datastore. 

108 """ 

109 

110 location: Location 

111 """The location from which to read the dataset.""" 

112 

113 formatter: Formatter 

114 """The `Formatter` to use to deserialize the dataset.""" 

115 

116 info: StoredFileInfo 

117 """Stored information about this file and its formatter.""" 

118 

119 assemblerParams: dict 

120 """Parameters to use for post-processing the retrieved dataset.""" 

121 

122 component: Optional[str] 

123 """The component to be retrieved (can be `None`).""" 

124 

125 readStorageClass: StorageClass 

126 """The `StorageClass` of the dataset being read.""" 

127 

128 

129class FileLikeDatastore(GenericBaseDatastore): 

130 """Generic Datastore for file-based implementations. 

131 

132 Should always be sub-classed since key abstract methods are missing. 

133 

134 Parameters 

135 ---------- 

136 config : `DatastoreConfig` or `str` 

137 Configuration as either a `Config` object or URI to file. 

138 bridgeManager : `DatastoreRegistryBridgeManager` 

139 Object that manages the interface between `Registry` and datastores. 

140 butlerRoot : `str`, optional 

141 New datastore root to use to override the configuration value. 

142 

143 Raises 

144 ------ 

145 ValueError 

146 If root location does not exist and ``create`` is `False` in the 

147 configuration. 

148 """ 

149 

150 defaultConfigFile: ClassVar[Optional[str]] = None 

151 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

152 absolute path. Can be None if no defaults specified. 

153 """ 

154 

155 root: str 

156 """Root directory or URI of this `Datastore`.""" 

157 

158 locationFactory: LocationFactory 

159 """Factory for creating locations relative to the datastore root.""" 

160 

161 formatterFactory: FormatterFactory 

162 """Factory for creating instances of formatters.""" 

163 

164 templates: FileTemplates 

165 """File templates that can be used by this `Datastore`.""" 

166 

167 composites: CompositesMap 

168 """Determines whether a dataset should be disassembled on put.""" 

169 

170 @classmethod 

171 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

172 """Set any filesystem-dependent config options for this Datastore to 

173 be appropriate for a new empty repository with the given root. 

174 

175 Parameters 

176 ---------- 

177 root : `str` 

178 URI to the root of the data repository. 

179 config : `Config` 

180 A `Config` to update. Only the subset understood by 

181 this component will be updated. Will not expand 

182 defaults. 

183 full : `Config` 

184 A complete config with all defaults expanded that can be 

185 converted to a `DatastoreConfig`. Read-only and will not be 

186 modified by this method. 

187 Repository-specific options that should not be obtained 

188 from defaults when Butler instances are constructed 

189 should be copied from ``full`` to ``config``. 

190 overwrite : `bool`, optional 

191 If `False`, do not modify a value in ``config`` if the value 

192 already exists. Default is always to overwrite with the provided 

193 ``root``. 

194 

195 Notes 

196 ----- 

197 If a keyword is explicitly defined in the supplied ``config`` it 

198 will not be overridden by this method if ``overwrite`` is `False`. 

199 This allows explicit values set in external configs to be retained. 

200 """ 

201 Config.updateParameters(DatastoreConfig, config, full, 

202 toUpdate={"root": root}, 

203 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

204 

205 @classmethod 

206 def makeTableSpec(cls) -> ddl.TableSpec: 

207 return ddl.TableSpec( 

208 fields=[ 

209 ddl.FieldSpec(name="dataset_id", dtype=BigInteger, primaryKey=True), 

210 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

211 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

212 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

213 # Use empty string to indicate no component 

214 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

215 # TODO: should checksum be Base64Bytes instead? 

216 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

217 ddl.FieldSpec(name="file_size", dtype=BigInteger, nullable=True), 

218 ], 

219 unique=frozenset(), 

220 ) 

221 

222 def __init__(self, config: Union[DatastoreConfig, str], 

223 bridgeManager: DatastoreRegistryBridgeManager, butlerRoot: str = None): 

224 super().__init__(config, bridgeManager) 

225 if "root" not in self.config: 225 ↛ 226line 225 didn't jump to line 226, because the condition on line 225 was never true

226 raise ValueError("No root directory specified in configuration") 

227 

228 # Name ourselves either using an explicit name or a name 

229 # derived from the (unexpanded) root 

230 if "name" in self.config: 

231 self.name = self.config["name"] 

232 else: 

233 # We use the unexpanded root in the name to indicate that this 

234 # datastore can be moved without having to update registry. 

235 self.name = "{}@{}".format(type(self).__name__, 

236 self.config["root"]) 

237 

238 # Support repository relocation in config 

239 # Existence of self.root is checked in subclass 

240 self.root = replaceRoot(self.config["root"], butlerRoot) 

241 

242 self.locationFactory = LocationFactory(self.root) 

243 self.formatterFactory = FormatterFactory() 

244 

245 # Now associate formatters with storage classes 

246 self.formatterFactory.registerFormatters(self.config["formatters"], 

247 universe=bridgeManager.universe) 

248 

249 # Read the file naming templates 

250 self.templates = FileTemplates(self.config["templates"], 

251 universe=bridgeManager.universe) 

252 

253 # See if composites should be disassembled 

254 self.composites = CompositesMap(self.config["composites"], 

255 universe=bridgeManager.universe) 

256 

257 tableName = self.config["records", "table"] 

258 try: 

259 # Storage of paths and formatters, keyed by dataset_id 

260 self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec()) 

261 # Interface to Registry. 

262 self._bridge = bridgeManager.register(self.name) 

263 except ReadOnlyDatabaseError: 

264 # If the database is read only and we just tried and failed to 

265 # create a table, it means someone is trying to create a read-only 

266 # butler client for an empty repo. That should be okay, as long 

267 # as they then try to get any datasets before some other client 

268 # creates the table. Chances are they'rejust validating 

269 # configuration. 

270 pass 

271 

272 # Determine whether checksums should be used 

273 self.useChecksum = self.config.get("checksum", True) 

274 

275 def __str__(self) -> str: 

276 return self.root 

277 

278 @property 

279 def bridge(self) -> DatastoreRegistryBridge: 

280 return self._bridge 

281 

282 @abstractmethod 

283 def _artifact_exists(self, location: Location) -> bool: 

284 """Check that an artifact exists in this datastore at the specified 

285 location. 

286 

287 Parameters 

288 ---------- 

289 location : `Location` 

290 Expected location of the artifact associated with this datastore. 

291 

292 Returns 

293 ------- 

294 exists : `bool` 

295 True if the location can be found, false otherwise. 

296 """ 

297 raise NotImplementedError() 

298 

299 @abstractmethod 

300 def _delete_artifact(self, location: Location) -> None: 

301 """Delete the artifact from the datastore. 

302 

303 Parameters 

304 ---------- 

305 location : `Location` 

306 Location of the artifact associated with this datastore. 

307 """ 

308 raise NotImplementedError() 

309 

310 def addStoredItemInfo(self, refs: Iterable[DatasetRef], infos: Iterable[StoredFileInfo]) -> None: 

311 # Docstring inherited from GenericBaseDatastore 

312 records = [] 

313 for ref, info in zip(refs, infos): 

314 # Component should come from ref and fall back on info 

315 component = ref.datasetType.component() 

316 if component is None and info.component is not None: 316 ↛ 317line 316 didn't jump to line 317, because the condition on line 316 was never true

317 component = info.component 

318 if component is None: 

319 # Use empty string since we want this to be part of the 

320 # primary key. 

321 component = NULLSTR 

322 records.append( 

323 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

324 storage_class=info.storageClass.name, component=component, 

325 checksum=info.checksum, file_size=info.file_size) 

326 ) 

327 self._table.insert(*records) 

328 

329 def getStoredItemInfo(self, ref: DatasetIdRef) -> StoredFileInfo: 

330 # Docstring inherited from GenericBaseDatastore 

331 

332 if ref.id is None: 332 ↛ 333line 332 didn't jump to line 333, because the condition on line 332 was never true

333 raise RuntimeError("Unable to retrieve information for unresolved DatasetRef") 

334 

335 where: Dict[str, Union[int, str]] = {"dataset_id": ref.id} 

336 

337 # If we have no component we want the row from this table without 

338 # a component. If we do have a component we either need the row 

339 # with no component or the row with the component, depending on how 

340 # this dataset was dissassembled. 

341 

342 # if we are emptying trash we won't have real refs so can't constrain 

343 # by component. Will need to fix this to return multiple matches 

344 # in future. 

345 component = None 

346 try: 

347 component = ref.datasetType.component() 

348 except AttributeError: 

349 pass 

350 else: 

351 if component is None: 351 ↛ 356line 351 didn't jump to line 356, because the condition on line 351 was never false

352 where["component"] = NULLSTR 

353 

354 # Look for the dataset_id -- there might be multiple matches 

355 # if we have disassembled the dataset. 

356 records = list(self._table.fetch(**where)) 

357 if len(records) == 0: 357 ↛ 358line 357 didn't jump to line 358, because the condition on line 357 was never true

358 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.") 

359 

360 # if we are not asking for a component 

361 if not component and len(records) != 1: 361 ↛ 362line 361 didn't jump to line 362, because the condition on line 361 was never true

362 raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}") 

363 

364 # if we had a FakeDatasetRef we pick the first record regardless 

365 if isinstance(ref, FakeDatasetRef): 365 ↛ 366line 365 didn't jump to line 366, because the condition on line 365 was never true

366 record = records[0] 

367 else: 

368 records_by_component = {} 

369 for r in records: 

370 this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None 

371 records_by_component[this_component] = r 

372 

373 # Look for component by name else fall back to the parent 

374 for lookup in (component, None): 374 ↛ 379line 374 didn't jump to line 379, because the loop on line 374 didn't complete

375 if lookup in records_by_component: 375 ↛ 374line 375 didn't jump to line 374, because the condition on line 375 was never false

376 record = records_by_component[lookup] 

377 break 

378 else: 

379 raise KeyError(f"Unable to retrieve location for component {component} associated with " 

380 f"dataset {ref}.") 

381 

382 # Convert name of StorageClass to instance 

383 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

384 

385 return StoredFileInfo(formatter=record["formatter"], 

386 path=record["path"], 

387 storageClass=storageClass, 

388 component=component, 

389 checksum=record["checksum"], 

390 file_size=record["file_size"]) 

391 

392 def getStoredItemsInfo(self, ref: DatasetIdRef) -> List[StoredFileInfo]: 

393 # Docstring inherited from GenericBaseDatastore 

394 

395 # Look for the dataset_id -- there might be multiple matches 

396 # if we have disassembled the dataset. 

397 records = list(self._table.fetch(dataset_id=ref.id)) 

398 

399 results = [] 

400 for record in records: 

401 # Convert name of StorageClass to instance 

402 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

403 component = record["component"] if (record["component"] 

404 and record["component"] != NULLSTR) else None 

405 

406 info = StoredFileInfo(formatter=record["formatter"], 

407 path=record["path"], 

408 storageClass=storageClass, 

409 component=component, 

410 checksum=record["checksum"], 

411 file_size=record["file_size"]) 

412 results.append(info) 

413 

414 return results 

415 

416 def _registered_refs_per_artifact(self, pathInStore: str) -> Set[int]: 

417 """Return all dataset refs associated with the supplied path. 

418 

419 Parameters 

420 ---------- 

421 pathInStore : `str` 

422 Path of interest in the data store. 

423 

424 Returns 

425 ------- 

426 ids : `set` of `int` 

427 All `DatasetRef` IDs associated with this path. 

428 """ 

429 records = list(self._table.fetch(path=pathInStore)) 

430 ids = {r["dataset_id"] for r in records} 

431 return ids 

432 

433 def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: 

434 # Docstring inherited from GenericBaseDatastore 

435 self._table.delete(dataset_id=ref.id) 

436 

437 def _get_dataset_location_info(self, 

438 ref: DatasetRef) -> Tuple[Optional[Location], Optional[StoredFileInfo]]: 

439 """Find the `Location` of the requested dataset in the 

440 `Datastore` and the associated stored file information. 

441 

442 Parameters 

443 ---------- 

444 ref : `DatasetRef` 

445 Reference to the required `Dataset`. 

446 

447 Returns 

448 ------- 

449 location : `Location` 

450 Location of the dataset within the datastore. 

451 Returns `None` if the dataset can not be located. 

452 info : `StoredFileInfo` 

453 Stored information about this file and its formatter. 

454 """ 

455 # Get the file information (this will fail if no file) 

456 try: 

457 storedFileInfo = self.getStoredItemInfo(ref) 

458 except KeyError: 

459 return None, None 

460 

461 # Use the path to determine the location 

462 location = self.locationFactory.fromPath(storedFileInfo.path) 

463 

464 return location, storedFileInfo 

465 

466 def _get_dataset_locations_info(self, ref: DatasetIdRef) -> List[Tuple[Location, StoredFileInfo]]: 

467 r"""Find all the `Location`\ s of the requested dataset in the 

468 `Datastore` and the associated stored file information. 

469 

470 Parameters 

471 ---------- 

472 ref : `DatasetRef` 

473 Reference to the required `Dataset`. 

474 

475 Returns 

476 ------- 

477 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

478 Location of the dataset within the datastore and 

479 stored information about each file and its formatter. 

480 """ 

481 # Get the file information (this will fail if no file) 

482 records = self.getStoredItemsInfo(ref) 

483 

484 # Use the path to determine the location 

485 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

486 

487 def _can_remove_dataset_artifact(self, ref: DatasetIdRef, location: Location) -> bool: 

488 """Check that there is only one dataset associated with the 

489 specified artifact. 

490 

491 Parameters 

492 ---------- 

493 ref : `DatasetRef` or `FakeDatasetRef` 

494 Dataset to be removed. 

495 location : `Location` 

496 The location of the artifact to be removed. 

497 

498 Returns 

499 ------- 

500 can_remove : `Bool` 

501 True if the artifact can be safely removed. 

502 """ 

503 

504 # Get all entries associated with this path 

505 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

506 if not allRefs: 506 ↛ 507line 506 didn't jump to line 507, because the condition on line 506 was never true

507 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

508 

509 # Remove these refs from all the refs and if there is nothing left 

510 # then we can delete 

511 remainingRefs = allRefs - {ref.id} 

512 

513 if remainingRefs: 

514 return False 

515 return True 

516 

517 def _prepare_for_get(self, ref: DatasetRef, 

518 parameters: Optional[Mapping[str, Any]] = None) -> List[DatastoreFileGetInformation]: 

519 """Check parameters for ``get`` and obtain formatter and 

520 location. 

521 

522 Parameters 

523 ---------- 

524 ref : `DatasetRef` 

525 Reference to the required Dataset. 

526 parameters : `dict` 

527 `StorageClass`-specific parameters that specify, for example, 

528 a slice of the dataset to be loaded. 

529 

530 Returns 

531 ------- 

532 getInfo : `list` [`DatastoreFileGetInformation`] 

533 Parameters needed to retrieve each file. 

534 """ 

535 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

536 

537 # Get file metadata and internal metadata 

538 fileLocations = self._get_dataset_locations_info(ref) 

539 if not fileLocations: 

540 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

541 

542 # The storage class we want to use eventually 

543 refStorageClass = ref.datasetType.storageClass 

544 

545 # Check that the supplied parameters are suitable for the type read 

546 refStorageClass.validateParameters(parameters) 

547 

548 if len(fileLocations) > 1: 

549 disassembled = True 

550 else: 

551 disassembled = False 

552 

553 # Is this a component request? 

554 refComponent = ref.datasetType.component() 

555 

556 fileGetInfo = [] 

557 for location, storedFileInfo in fileLocations: 

558 

559 # The storage class used to write the file 

560 writeStorageClass = storedFileInfo.storageClass 

561 

562 # If this has been disassembled we need read to match the write 

563 if disassembled: 

564 readStorageClass = writeStorageClass 

565 else: 

566 readStorageClass = refStorageClass 

567 

568 formatter = getInstanceOf(storedFileInfo.formatter, 

569 FileDescriptor(location, readStorageClass=readStorageClass, 

570 storageClass=writeStorageClass, parameters=parameters), 

571 ref.dataId) 

572 

573 _, notFormatterParams = formatter.segregateParameters() 

574 

575 # Of the remaining parameters, extract the ones supported by 

576 # this StorageClass (for components not all will be handled) 

577 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

578 

579 # The ref itself could be a component if the dataset was 

580 # disassembled by butler, or we disassembled in datastore and 

581 # components came from the datastore records 

582 component = storedFileInfo.component if storedFileInfo.component else refComponent 

583 

584 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

585 assemblerParams, component, readStorageClass)) 

586 

587 return fileGetInfo 

588 

589 def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> Tuple[Location, Formatter]: 

590 """Check the arguments for ``put`` and obtain formatter and 

591 location. 

592 

593 Parameters 

594 ---------- 

595 inMemoryDataset : `object` 

596 The dataset to store. 

597 ref : `DatasetRef` 

598 Reference to the associated Dataset. 

599 

600 Returns 

601 ------- 

602 location : `Location` 

603 The location to write the dataset. 

604 formatter : `Formatter` 

605 The `Formatter` to use to write the dataset. 

606 

607 Raises 

608 ------ 

609 TypeError 

610 Supplied object and storage class are inconsistent. 

611 DatasetTypeNotSupportedError 

612 The associated `DatasetType` is not handled by this datastore. 

613 """ 

614 self._validate_put_parameters(inMemoryDataset, ref) 

615 

616 # Work out output file name 

617 try: 

618 template = self.templates.getTemplate(ref) 

619 except KeyError as e: 

620 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

621 

622 location = self.locationFactory.fromPath(template.format(ref)) 

623 

624 # Get the formatter based on the storage class 

625 storageClass = ref.datasetType.storageClass 

626 try: 

627 formatter = self.formatterFactory.getFormatter(ref, 

628 FileDescriptor(location, 

629 storageClass=storageClass), 

630 ref.dataId) 

631 except KeyError as e: 

632 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e 

633 

634 # Now that we know the formatter, update the location 

635 location = formatter.makeUpdatedLocation(location) 

636 

637 return location, formatter 

638 

639 @abstractmethod 

640 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

641 """Standardize the path of a to-be-ingested file. 

642 

643 Parameters 

644 ---------- 

645 path : `str` 

646 Path of a file to be ingested. 

647 transfer : `str`, optional 

648 How (and whether) the dataset should be added to the datastore. 

649 See `ingest` for details of transfer modes. 

650 This implementation is provided only so 

651 `NotImplementedError` can be raised if the mode is not supported; 

652 actual transfers are deferred to `_extractIngestInfo`. 

653 

654 Returns 

655 ------- 

656 path : `str` 

657 New path in what the datastore considers standard form. 

658 

659 Notes 

660 ----- 

661 Subclasses of `FileLikeDatastore` should implement this method instead 

662 of `_prepIngest`. It should not modify the data repository or given 

663 file in any way. 

664 

665 Raises 

666 ------ 

667 NotImplementedError 

668 Raised if the datastore does not support the given transfer mode 

669 (including the case where ingest is not supported at all). 

670 FileNotFoundError 

671 Raised if one of the given files does not exist. 

672 """ 

673 raise NotImplementedError("Must be implemented by subclasses.") 

674 

675 @abstractmethod 

676 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, 

677 formatter: Union[Formatter, Type[Formatter]], 

678 transfer: Optional[str] = None) -> StoredFileInfo: 

679 """Relocate (if necessary) and extract `StoredFileInfo` from a 

680 to-be-ingested file. 

681 

682 Parameters 

683 ---------- 

684 path : `str` 

685 Path of a file to be ingested. 

686 ref : `DatasetRef` 

687 Reference for the dataset being ingested. Guaranteed to have 

688 ``dataset_id not None`. 

689 formatter : `type` or `Formatter` 

690 `Formatter` subclass to use for this dataset or an instance. 

691 transfer : `str`, optional 

692 How (and whether) the dataset should be added to the datastore. 

693 See `ingest` for details of transfer modes. 

694 

695 Returns 

696 ------- 

697 info : `StoredFileInfo` 

698 Internal datastore record for this file. This will be inserted by 

699 the caller; the `_extractIngestInfo` is only resposible for 

700 creating and populating the struct. 

701 

702 Raises 

703 ------ 

704 FileNotFoundError 

705 Raised if one of the given files does not exist. 

706 FileExistsError 

707 Raised if transfer is not `None` but the (internal) location the 

708 file would be moved to is already occupied. 

709 """ 

710 raise NotImplementedError("Must be implemented by subclasses.") 

711 

712 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

713 # Docstring inherited from Datastore._prepIngest. 

714 filtered = [] 

715 for dataset in datasets: 

716 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

717 if not acceptable: 

718 continue 

719 else: 

720 dataset.refs = acceptable 

721 if dataset.formatter is None: 

722 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

723 else: 

724 assert isinstance(dataset.formatter, (type, str)) 

725 dataset.formatter = getClassOf(dataset.formatter) 

726 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

727 filtered.append(dataset) 

728 return _IngestPrepData(filtered) 

729 

730 @transactional 

731 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None) -> None: 

732 # Docstring inherited from Datastore._finishIngest. 

733 refsAndInfos = [] 

734 for dataset in prepData.datasets: 

735 # Do ingest as if the first dataset ref is associated with the file 

736 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

737 transfer=transfer) 

738 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

739 self._register_datasets(refsAndInfos) 

740 

741 def _calculate_ingested_datastore_name(self, srcUri: ButlerURI, ref: DatasetRef, 

742 formatter: Union[Formatter, Type[Formatter]]) -> Location: 

743 """Given a source URI and a DatasetRef, determine the name the 

744 dataset will have inside datastore. 

745 

746 Parameters 

747 ---------- 

748 srcUri : `ButlerURI` 

749 URI to the source dataset file. 

750 ref : `DatasetRef` 

751 Ref associated with the newly-ingested dataset artifact. This 

752 is used to determine the name within the datastore. 

753 formatter : `Formatter` or Formatter class. 

754 Formatter to use for validation. Can be a class or an instance. 

755 

756 Returns 

757 ------- 

758 location : `Location` 

759 Target location for the newly-ingested dataset. 

760 """ 

761 # Ingesting a file from outside the datastore. 

762 # This involves a new name. 

763 template = self.templates.getTemplate(ref) 

764 location = self.locationFactory.fromPath(template.format(ref)) 

765 

766 # Get the extension 

767 ext = srcUri.getExtension() 

768 

769 # Update the destination to include that extension 

770 location.updateExtension(ext) 

771 

772 # Ask the formatter to validate this extension 

773 formatter.validateExtension(location) 

774 

775 return location 

776 

777 @abstractmethod 

778 def _write_in_memory_to_artifact(self, inMemoryDataset: Any, ref: DatasetRef) -> StoredFileInfo: 

779 """Write out in memory dataset to datastore. 

780 

781 Parameters 

782 ---------- 

783 inMemoryDataset : `object` 

784 Dataset to write to datastore. 

785 ref : `DatasetRef` 

786 Registry information associated with this dataset. 

787 

788 Returns 

789 ------- 

790 info : `StoredFileInfo` 

791 Information describin the artifact written to the datastore. 

792 """ 

793 raise NotImplementedError() 

794 

795 @abstractmethod 

796 def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation, 

797 ref: DatasetRef, isComponent: bool = False) -> Any: 

798 """Read the artifact from datastore into in memory object. 

799 

800 Parameters 

801 ---------- 

802 getInfo : `DatastoreFileGetInformation` 

803 Information about the artifact within the datastore. 

804 ref : `DatasetRef` 

805 The registry information associated with this artifact. 

806 isComponent : `bool` 

807 Flag to indicate if a component is being read from this artifact. 

808 

809 Returns 

810 ------- 

811 inMemoryDataset : `object` 

812 The artifact as a python object. 

813 """ 

814 raise NotImplementedError() 

815 

816 def exists(self, ref: DatasetRef) -> bool: 

817 """Check if the dataset exists in the datastore. 

818 

819 Parameters 

820 ---------- 

821 ref : `DatasetRef` 

822 Reference to the required dataset. 

823 

824 Returns 

825 ------- 

826 exists : `bool` 

827 `True` if the entity exists in the `Datastore`. 

828 """ 

829 fileLocations = self._get_dataset_locations_info(ref) 

830 if not fileLocations: 

831 return False 

832 for location, _ in fileLocations: 

833 if not self._artifact_exists(location): 

834 return False 

835 

836 return True 

837 

838 def getURIs(self, ref: DatasetRef, 

839 predict: bool = False) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

840 """Return URIs associated with dataset. 

841 

842 Parameters 

843 ---------- 

844 ref : `DatasetRef` 

845 Reference to the required dataset. 

846 predict : `bool`, optional 

847 If the datastore does not know about the dataset, should it 

848 return a predicted URI or not? 

849 

850 Returns 

851 ------- 

852 primary : `ButlerURI` 

853 The URI to the primary artifact associated with this dataset. 

854 If the dataset was disassembled within the datastore this 

855 may be `None`. 

856 components : `dict` 

857 URIs to any components associated with the dataset artifact. 

858 Can be empty if there are no components. 

859 """ 

860 

861 primary: Optional[ButlerURI] = None 

862 components: Dict[str, ButlerURI] = {} 

863 

864 # if this has never been written then we have to guess 

865 if not self.exists(ref): 

866 if not predict: 

867 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

868 

869 def predictLocation(thisRef: DatasetRef) -> Location: 

870 template = self.templates.getTemplate(thisRef) 

871 location = self.locationFactory.fromPath(template.format(thisRef)) 

872 storageClass = ref.datasetType.storageClass 

873 formatter = self.formatterFactory.getFormatter(thisRef, 

874 FileDescriptor(location, 

875 storageClass=storageClass)) 

876 # Try to use the extension attribute but ignore problems if the 

877 # formatter does not define one. 

878 try: 

879 location = formatter.makeUpdatedLocation(location) 

880 except Exception: 

881 # Use the default extension 

882 pass 

883 return location 

884 

885 doDisassembly = self.composites.shouldBeDisassembled(ref) 

886 

887 if doDisassembly: 

888 

889 for component, componentStorage in ref.datasetType.storageClass.components.items(): 

890 compTypeName = ref.datasetType.componentTypeName(component) 

891 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions, 

892 storageClass=componentStorage) 

893 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False) 

894 

895 compLocation = predictLocation(compRef) 

896 

897 # Add a URI fragment to indicate this is a guess 

898 components[component] = ButlerURI(compLocation.uri + "#predicted") 

899 

900 else: 

901 

902 location = predictLocation(ref) 

903 

904 # Add a URI fragment to indicate this is a guess 

905 primary = ButlerURI(location.uri + "#predicted") 

906 

907 return primary, components 

908 

909 # If this is a ref that we have written we can get the path. 

910 # Get file metadata and internal metadata 

911 fileLocations = self._get_dataset_locations_info(ref) 

912 

913 if not fileLocations: 913 ↛ 914line 913 didn't jump to line 914, because the condition on line 913 was never true

914 raise RuntimeError(f"Unexpectedly got no artifacts for dataset {ref}") 

915 

916 if len(fileLocations) == 1: 

917 # No disassembly so this is the primary URI 

918 primary = ButlerURI(fileLocations[0][0].uri) 

919 

920 else: 

921 for location, storedFileInfo in fileLocations: 

922 if storedFileInfo.component is None: 922 ↛ 923line 922 didn't jump to line 923, because the condition on line 922 was never true

923 raise RuntimeError(f"Unexpectedly got no component name for a component at {location}") 

924 components[storedFileInfo.component] = ButlerURI(location.uri) 

925 

926 return primary, components 

927 

928 def getURI(self, ref: DatasetRef, predict: bool = False) -> ButlerURI: 

929 """URI to the Dataset. 

930 

931 Parameters 

932 ---------- 

933 ref : `DatasetRef` 

934 Reference to the required Dataset. 

935 predict : `bool` 

936 If `True`, allow URIs to be returned of datasets that have not 

937 been written. 

938 

939 Returns 

940 ------- 

941 uri : `str` 

942 URI pointing to the dataset within the datastore. If the 

943 dataset does not exist in the datastore, and if ``predict`` is 

944 `True`, the URI will be a prediction and will include a URI 

945 fragment "#predicted". 

946 If the datastore does not have entities that relate well 

947 to the concept of a URI the returned URI will be 

948 descriptive. The returned URI is not guaranteed to be obtainable. 

949 

950 Raises 

951 ------ 

952 FileNotFoundError 

953 Raised if a URI has been requested for a dataset that does not 

954 exist and guessing is not allowed. 

955 RuntimeError 

956 Raised if a request is made for a single URI but multiple URIs 

957 are associated with this dataset. 

958 

959 Notes 

960 ----- 

961 When a predicted URI is requested an attempt will be made to form 

962 a reasonable URI based on file templates and the expected formatter. 

963 """ 

964 primary, components = self.getURIs(ref, predict) 

965 if primary is None or components: 965 ↛ 966line 965 didn't jump to line 966, because the condition on line 965 was never true

966 raise RuntimeError(f"Dataset ({ref}) includes distinct URIs for components. " 

967 "Use Dataastore.getURIs() instead.") 

968 return primary 

969 

970 def get(self, ref: DatasetRef, parameters: Optional[Mapping[str, Any]] = None) -> Any: 

971 """Load an InMemoryDataset from the store. 

972 

973 Parameters 

974 ---------- 

975 ref : `DatasetRef` 

976 Reference to the required Dataset. 

977 parameters : `dict` 

978 `StorageClass`-specific parameters that specify, for example, 

979 a slice of the dataset to be loaded. 

980 

981 Returns 

982 ------- 

983 inMemoryDataset : `object` 

984 Requested dataset or slice thereof as an InMemoryDataset. 

985 

986 Raises 

987 ------ 

988 FileNotFoundError 

989 Requested dataset can not be retrieved. 

990 TypeError 

991 Return value from formatter has unexpected type. 

992 ValueError 

993 Formatter failed to process the dataset. 

994 """ 

995 allGetInfo = self._prepare_for_get(ref, parameters) 

996 refComponent = ref.datasetType.component() 

997 

998 if len(allGetInfo) > 1 and not refComponent: 

999 # This was a disassembled dataset spread over multiple files 

1000 # and we need to put them all back together again. 

1001 # Read into memory and then assemble 

1002 usedParams = set() 

1003 components: Dict[str, Any] = {} 

1004 for getInfo in allGetInfo: 

1005 # assemblerParams are parameters not understood by the 

1006 # associated formatter. 

1007 usedParams.update(set(getInfo.assemblerParams)) 

1008 

1009 component = getInfo.component 

1010 

1011 if component is None: 1011 ↛ 1012line 1011 didn't jump to line 1012, because the condition on line 1011 was never true

1012 raise RuntimeError(f"Internal error in datastore assembly of {ref}") 

1013 

1014 # We do not want the formatter to think it's reading 

1015 # a component though because it is really reading a 

1016 # standalone dataset -- always tell reader it is not a 

1017 # component. 

1018 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

1019 

1020 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

1021 

1022 # Any unused parameters will have to be passed to the assembler 

1023 if parameters: 

1024 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

1025 else: 

1026 unusedParams = {} 

1027 

1028 # Process parameters 

1029 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

1030 parameters=unusedParams) 

1031 

1032 else: 

1033 # Single file request or component from that composite file 

1034 allComponents = {i.component: i for i in allGetInfo} 

1035 for lookup in (refComponent, None): 1035 ↛ 1040line 1035 didn't jump to line 1040, because the loop on line 1035 didn't complete

1036 if lookup in allComponents: 1036 ↛ 1035line 1036 didn't jump to line 1035, because the condition on line 1036 was never false

1037 getInfo = allComponents[lookup] 

1038 break 

1039 else: 

1040 raise FileNotFoundError(f"Component {refComponent} not found " 

1041 f"for ref {ref} in datastore {self.name}") 

1042 

1043 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None) 

1044 

1045 @transactional 

1046 def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: 

1047 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

1048 

1049 Parameters 

1050 ---------- 

1051 inMemoryDataset : `object` 

1052 The dataset to store. 

1053 ref : `DatasetRef` 

1054 Reference to the associated Dataset. 

1055 

1056 Raises 

1057 ------ 

1058 TypeError 

1059 Supplied object and storage class are inconsistent. 

1060 DatasetTypeNotSupportedError 

1061 The associated `DatasetType` is not handled by this datastore. 

1062 

1063 Notes 

1064 ----- 

1065 If the datastore is configured to reject certain dataset types it 

1066 is possible that the put will fail and raise a 

1067 `DatasetTypeNotSupportedError`. The main use case for this is to 

1068 allow `ChainedDatastore` to put to multiple datastores without 

1069 requiring that every datastore accepts the dataset. 

1070 """ 

1071 

1072 doDisassembly = self.composites.shouldBeDisassembled(ref) 

1073 # doDisassembly = True 

1074 

1075 artifacts = [] 

1076 if doDisassembly: 

1077 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset) 

1078 for component, componentInfo in components.items(): 

1079 compTypeName = ref.datasetType.componentTypeName(component) 

1080 # Don't recurse because we want to take advantage of 

1081 # bulk insert -- need a new DatasetRef that refers to the 

1082 # same dataset_id but has the component DatasetType 

1083 # DatasetType does not refer to the types of components 

1084 # So we construct one ourselves. 

1085 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions, 

1086 storageClass=componentInfo.storageClass) 

1087 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False) 

1088 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

1089 artifacts.append((compRef, storedInfo)) 

1090 else: 

1091 # Write the entire thing out 

1092 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

1093 artifacts.append((ref, storedInfo)) 

1094 

1095 self._register_datasets(artifacts) 

1096 

1097 @transactional 

1098 def trash(self, ref: DatasetRef, ignore_errors: bool = True) -> None: 

1099 """Indicate to the datastore that a dataset can be removed. 

1100 

1101 Parameters 

1102 ---------- 

1103 ref : `DatasetRef` 

1104 Reference to the required Dataset. 

1105 ignore_errors : `bool` 

1106 If `True` return without error even if something went wrong. 

1107 Problems could occur if another process is simultaneously trying 

1108 to delete. 

1109 

1110 Raises 

1111 ------ 

1112 FileNotFoundError 

1113 Attempt to remove a dataset that does not exist. 

1114 """ 

1115 # Get file metadata and internal metadata 

1116 log.debug("Trashing %s in datastore %s", ref, self.name) 

1117 

1118 fileLocations = self._get_dataset_locations_info(ref) 

1119 

1120 if not fileLocations: 

1121 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

1122 if ignore_errors: 

1123 log.warning(err_msg) 

1124 return 

1125 else: 

1126 raise FileNotFoundError(err_msg) 

1127 

1128 for location, storedFileInfo in fileLocations: 

1129 if not self._artifact_exists(location): 1129 ↛ 1130line 1129 didn't jump to line 1130, because the condition on line 1129 was never true

1130 err_msg = f"Dataset is known to datastore {self.name} but " \ 

1131 f"associated artifact ({location.uri}) is missing" 

1132 if ignore_errors: 

1133 log.warning(err_msg) 

1134 return 

1135 else: 

1136 raise FileNotFoundError(err_msg) 

1137 

1138 # Mark dataset as trashed 

1139 try: 

1140 self._move_to_trash_in_registry(ref) 

1141 except Exception as e: 

1142 if ignore_errors: 

1143 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

1144 f"but encountered an error: {e}") 

1145 pass 

1146 else: 

1147 raise 

1148 

1149 @transactional 

1150 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1151 """Remove all datasets from the trash. 

1152 

1153 Parameters 

1154 ---------- 

1155 ignore_errors : `bool` 

1156 If `True` return without error even if something went wrong. 

1157 Problems could occur if another process is simultaneously trying 

1158 to delete. 

1159 """ 

1160 log.debug("Emptying trash in datastore %s", self.name) 

1161 # Context manager will empty trash iff we finish it without raising. 

1162 with self._bridge.emptyTrash() as trashed: 

1163 for ref in trashed: 

1164 fileLocations = self._get_dataset_locations_info(ref) 

1165 

1166 if not fileLocations: 1166 ↛ 1167line 1166 didn't jump to line 1167, because the condition on line 1166 was never true

1167 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

1168 if ignore_errors: 

1169 log.warning(err_msg) 

1170 continue 

1171 else: 

1172 raise FileNotFoundError(err_msg) 

1173 

1174 for location, _ in fileLocations: 

1175 

1176 if not self._artifact_exists(location): 1176 ↛ 1177line 1176 didn't jump to line 1177, because the condition on line 1176 was never true

1177 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

1178 if ignore_errors: 

1179 log.warning(err_msg) 

1180 continue 

1181 else: 

1182 raise FileNotFoundError(err_msg) 

1183 

1184 # Can only delete the artifact if there are no references 

1185 # to the file from untrashed dataset refs. 

1186 if self._can_remove_dataset_artifact(ref, location): 

1187 # Point of no return for this artifact 

1188 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1189 try: 

1190 self._delete_artifact(location) 

1191 except Exception as e: 

1192 if ignore_errors: 

1193 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1194 location.uri, self.name, e) 

1195 else: 

1196 raise 

1197 

1198 # Now must remove the entry from the internal registry even if 

1199 # the artifact removal failed and was ignored, 

1200 # otherwise the removal check above will never be true 

1201 try: 

1202 # There may be multiple rows associated with this ref 

1203 # depending on disassembly 

1204 self.removeStoredItemInfo(ref) 

1205 except Exception as e: 

1206 if ignore_errors: 

1207 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1208 ref.id, location.uri, self.name, e) 

1209 continue 

1210 else: 

1211 raise FileNotFoundError(err_msg) 

1212 

1213 def validateConfiguration(self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], 

1214 logFailures: bool = False) -> None: 

1215 """Validate some of the configuration for this datastore. 

1216 

1217 Parameters 

1218 ---------- 

1219 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1220 Entities to test against this configuration. Can be differing 

1221 types. 

1222 logFailures : `bool`, optional 

1223 If `True`, output a log message for every validation error 

1224 detected. 

1225 

1226 Raises 

1227 ------ 

1228 DatastoreValidationError 

1229 Raised if there is a validation problem with a configuration. 

1230 All the problems are reported in a single exception. 

1231 

1232 Notes 

1233 ----- 

1234 This method checks that all the supplied entities have valid file 

1235 templates and also have formatters defined. 

1236 """ 

1237 

1238 templateFailed = None 

1239 try: 

1240 self.templates.validateTemplates(entities, logFailures=logFailures) 

1241 except FileTemplateValidationError as e: 

1242 templateFailed = str(e) 

1243 

1244 formatterFailed = [] 

1245 for entity in entities: 

1246 try: 

1247 self.formatterFactory.getFormatterClass(entity) 

1248 except KeyError as e: 

1249 formatterFailed.append(str(e)) 

1250 if logFailures: 1250 ↛ 1245line 1250 didn't jump to line 1245, because the condition on line 1250 was never false

1251 log.fatal("Formatter failure: %s", e) 

1252 

1253 if templateFailed or formatterFailed: 

1254 messages = [] 

1255 if templateFailed: 1255 ↛ 1256line 1255 didn't jump to line 1256, because the condition on line 1255 was never true

1256 messages.append(templateFailed) 

1257 if formatterFailed: 1257 ↛ 1259line 1257 didn't jump to line 1259, because the condition on line 1257 was never false

1258 messages.append(",".join(formatterFailed)) 

1259 msg = ";\n".join(messages) 

1260 raise DatastoreValidationError(msg) 

1261 

1262 def getLookupKeys(self) -> Set[LookupKey]: 

1263 # Docstring is inherited from base class 

1264 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1265 self.constraints.getLookupKeys() 

1266 

1267 def validateKey(self, lookupKey: LookupKey, 

1268 entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1269 # Docstring is inherited from base class 

1270 # The key can be valid in either formatters or templates so we can 

1271 # only check the template if it exists 

1272 if lookupKey in self.templates: 

1273 try: 

1274 self.templates[lookupKey].validateTemplate(entity) 

1275 except FileTemplateValidationError as e: 

1276 raise DatastoreValidationError(e) from e