
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

"""Generic file-based datastore code."""

__all__ = ("FileLikeDatastore",)

import logging
from abc import abstractmethod

from sqlalchemy import Integer, String

from dataclasses import dataclass
from typing import Optional, List, Type

from lsst.daf.butler import (
    CompositesMap,
    Config,
    FileDataset,
    DatasetRef,
    DatasetType,
    DatasetTypeNotSupportedError,
    Datastore,
    DatastoreConfig,
    DatastoreValidationError,
    FileDescriptor,
    FileTemplates,
    FileTemplateValidationError,
    Formatter,
    FormatterFactory,
    Location,
    LocationFactory,
    StorageClass,
    StoredFileInfo,
)

from lsst.daf.butler import ddl
from lsst.daf.butler.registry.interfaces import (
    ReadOnlyDatabaseError,
    DatastoreRegistryBridge,
    FakeDatasetRef,
)

from lsst.daf.butler.core.repoRelocation import replaceRoot
from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)

# String to use when a Python None is encountered
NULLSTR = "__NULL_STRING__"


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for FileLikeDatastore ingest implementation.

    Parameters
    ----------
    datasets : `list` of `FileDataset`
        Files to be ingested by this datastore.
    """
    def __init__(self, datasets: List[FileDataset]):
        super().__init__(ref for dataset in datasets for ref in dataset.refs)
        self.datasets = datasets


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
    """

    location: Location
    """The location from which to read the dataset."""

    formatter: Formatter
    """The `Formatter` to use to deserialize the dataset."""

    info: StoredFileInfo
    """Stored information about this file and its formatter."""

    assemblerParams: dict
    """Parameters to use for post-processing the retrieved dataset."""

    component: Optional[str]
    """The component to be retrieved (can be `None`)."""

    readStorageClass: StorageClass
    """The `StorageClass` of the dataset being read."""


class FileLikeDatastore(GenericBaseDatastore):
    """Generic Datastore for file-based implementations.

    Should always be sub-classed since key abstract methods are missing.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration as either a `Config` object or URI to file.
    bridgeManager : `DatastoreRegistryBridgeManager`
        Object that manages the interface between `Registry` and datastores.
    butlerRoot : `str`, optional
        New datastore root to use to override the configuration value.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.
    """

    defaultConfigFile = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    root: str
    """Root directory or URI of this `Datastore`."""

    locationFactory: LocationFactory
    """Factory for creating locations relative to the datastore root."""

    formatterFactory: FormatterFactory
    """Factory for creating instances of formatters."""

    templates: FileTemplates
    """File templates that can be used by this `Datastore`."""

    composites: CompositesMap
    """Determines whether a dataset should be disassembled on put."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            URI to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        Config.updateParameters(DatastoreConfig, config, full,
                                toUpdate={"root": root},
                                toCopy=("cls", ("records", "table")), overwrite=overwrite)
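
    # Illustrative sketch (not part of the class API): when a new repository
    # is created, a concrete subclass of this datastore would typically be
    # configured along these lines, where ``fullConfig`` is assumed to be a
    # fully expanded `DatastoreConfig`:
    #
    #     config = Config()
    #     FileLikeDatastore.setConfigRoot("file:///data/repo", config, fullConfig)
    #
    # Only the "root" value and the copied keys ("cls" and ("records", "table"))
    # are written into ``config``; everything else is left to the defaults.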


    @classmethod
    def makeTableSpec(cls):
        return ddl.TableSpec(
            fields=NamedValueSet([
                ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
                ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
                ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
                ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
                # Use empty string to indicate no component
                ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True),
                # TODO: should checksum be Base64Bytes instead?
                ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
                ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
            ]),
            unique=frozenset(),
        )
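
    # For reference, a single row in the opaque records table defined above
    # looks roughly like this (values are illustrative only; the component
    # column stores NULLSTR when there is no component):
    #
    #     {"dataset_id": 42, "path": "run/datasetType/file.fits",
    #      "formatter": "some.module.SomeFormatter",
    #      "storage_class": "StructuredData", "component": "__NULL_STRING__",
    #      "checksum": None, "file_size": 1024}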


    def __init__(self, config, bridgeManager, butlerRoot=None):
        super().__init__(config, bridgeManager)
        if "root" not in self.config:
            raise ValueError("No root directory specified in configuration")

        # Name ourselves either using an explicit name or a name
        # derived from the (unexpanded) root
        if "name" in self.config:
            self.name = self.config["name"]
        else:
            # We use the unexpanded root in the name to indicate that this
            # datastore can be moved without having to update registry.
            self.name = "{}@{}".format(type(self).__name__,
                                       self.config["root"])

        # Support repository relocation in config
        # Existence of self.root is checked in subclass
        self.root = replaceRoot(self.config["root"], butlerRoot)

        self.locationFactory = LocationFactory(self.root)
        self.formatterFactory = FormatterFactory()

        # Now associate formatters with storage classes
        self.formatterFactory.registerFormatters(self.config["formatters"],
                                                 universe=bridgeManager.universe)

        # Read the file naming templates
        self.templates = FileTemplates(self.config["templates"],
                                       universe=bridgeManager.universe)

        # See if composites should be disassembled
        self.composites = CompositesMap(self.config["composites"],
                                        universe=bridgeManager.universe)

        tableName = self.config["records", "table"]
        try:
            # Storage of paths and formatters, keyed by dataset_id
            self._table = bridgeManager.opaque.register(tableName, self.makeTableSpec())
            # Interface to Registry.
            self._bridge = bridgeManager.register(self.name)
        except ReadOnlyDatabaseError:
            # If the database is read only and we just tried and failed to
            # create a table, it means someone is trying to create a read-only
            # butler client for an empty repo. That should be okay, as long
            # as they then try to get any datasets before some other client
            # creates the table. Chances are they're just validating
            # configuration.
            pass

        # Determine whether checksums should be used
        self.useChecksum = self.config.get("checksum", True)


    def __str__(self):
        return self.root

    @property
    def bridge(self) -> DatastoreRegistryBridge:
        return self._bridge

    @abstractmethod
    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        raise NotImplementedError()


    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore
        records = []
        for ref, info in zip(refs, infos):
            # Component should come from ref and fall back on info
            component = ref.datasetType.component()
            if component is None and info.component is not None:
                component = info.component
            if component is None:
                # Use empty string since we want this to be part of the
                # primary key.
                component = NULLSTR
            records.append(
                dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
                     storage_class=info.storageClass.name, component=component,
                     checksum=info.checksum, file_size=info.file_size)
            )
        self._table.insert(*records)

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore

        where = {"dataset_id": ref.id}

        # If we have no component we want the row from this table without
        # a component. If we do have a component we either need the row
        # with no component or the row with the component, depending on how
        # this dataset was disassembled.

        # If we are emptying trash we won't have real refs so can't constrain
        # by component. Will need to fix this to return multiple matches
        # in future.
        try:
            component = ref.datasetType.component()
        except AttributeError:
            component = None
        else:
            if component is None:
                where["component"] = NULLSTR

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self._table.fetch(**where))
        if len(records) == 0:
            raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")

        # If we are not asking for a component there should be a single match
        if not component and len(records) != 1:
            raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}")

        # If we had a FakeDatasetRef we pick the first record regardless
        if isinstance(ref, FakeDatasetRef):
            record = records[0]
        else:
            records_by_component = {}
            for r in records:
                this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None
                records_by_component[this_component] = r

            # Look for component by name else fall back to the parent
            for lookup in (component, None):
                if lookup in records_by_component:
                    record = records_by_component[lookup]
                    break
            else:
                raise KeyError(f"Unable to retrieve location for component {component} associated with "
                               f"dataset {ref}.")

        # Convert name of StorageClass to instance
        storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])

        return StoredFileInfo(formatter=record["formatter"],
                              path=record["path"],
                              storageClass=storageClass,
                              component=component,
                              checksum=record["checksum"],
                              file_size=record["file_size"])


    def getStoredItemsInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self._table.fetch(dataset_id=ref.id))

        results = []
        for record in records:
            # Convert name of StorageClass to instance
            storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
            component = record["component"] if (record["component"]
                                                and record["component"] != NULLSTR) else None

            info = StoredFileInfo(formatter=record["formatter"],
                                  path=record["path"],
                                  storageClass=storageClass,
                                  component=component,
                                  checksum=record["checksum"],
                                  file_size=record["file_size"])
            results.append(info)

        return results

    def _registered_refs_per_artifact(self, pathInStore):
        """Return all dataset refs associated with the supplied path.

        Parameters
        ----------
        pathInStore : `str`
            Path of interest in the data store.

        Returns
        -------
        ids : `set` of `int`
            All `DatasetRef` IDs associated with this path.
        """
        records = list(self._table.fetch(path=pathInStore))
        ids = {r["dataset_id"] for r in records}
        return ids

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore
        self._table.delete(dataset_id=ref.id)

    def _get_dataset_location_info(self, ref):
        """Find the `Location` of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        location : `Location`
            Location of the dataset within the datastore.
            Returns `None` if the dataset can not be located.
        info : `StoredFileInfo`
            Stored information about this file and its formatter.
        """
        # Get the file information (this will fail if no file)
        try:
            storedFileInfo = self.getStoredItemInfo(ref)
        except KeyError:
            return None, None

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location, storedFileInfo

    def _get_dataset_locations_info(self, ref):
        r"""Find all the `Location`\ s of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        results : `list` [`tuple` [`Location`, `StoredFileInfo`]]
            Location of the dataset within the datastore and
            stored information about each file and its formatter.
        """
        # Get the file information (this will fail if no file)
        records = self.getStoredItemsInfo(ref)

        # Use the path to determine the location
        return [(self.locationFactory.fromPath(r.path), r) for r in records]


    def _can_remove_dataset_artifact(self, ref, location):
        """Check that there is only one dataset associated with the
        specified artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to be removed.
        location : `Location`
            The location of the artifact to be removed.

        Returns
        -------
        can_remove : `bool`
            `True` if the artifact can be safely removed.
        """

        # Get all entries associated with this path
        allRefs = self._registered_refs_per_artifact(location.pathInStore)
        if not allRefs:
            raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")

        # Get all the refs associated with this dataset if it is a composite
        theseRefs = {r.id for r in ref.flatten([ref])}

        # Remove these refs from all the refs and if there is nothing left
        # then we can delete
        remainingRefs = allRefs - theseRefs

        if remainingRefs:
            return False
        return True
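
    # Example of the check above (ids are hypothetical): if the artifact at
    # ``location`` is shared by dataset_ids {1, 2, 3} and the ref being
    # removed flattens to ids {1, 2}, then ``remainingRefs`` is {3} and the
    # artifact must be kept.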


    def _prepare_for_get(self, ref, parameters=None):
        """Check parameters for ``get`` and obtain formatter and
        location.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        getInfo : `list` [`DatastoreFileGetInformation`]
            Parameters needed to retrieve each file.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        # Get file metadata and internal metadata
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            raise FileNotFoundError(f"Could not retrieve dataset {ref}.")

        # The storage class we want to use eventually
        refStorageClass = ref.datasetType.storageClass

        # Check that the supplied parameters are suitable for the type read
        refStorageClass.validateParameters(parameters)

        if len(fileLocations) > 1:
            disassembled = True
        else:
            disassembled = False

        # Is this a component request?
        refComponent = ref.datasetType.component()

        fileGetInfo = []
        for location, storedFileInfo in fileLocations:

            # The storage class used to write the file
            writeStorageClass = storedFileInfo.storageClass

            # If this has been disassembled we need read to match the write
            if disassembled:
                readStorageClass = writeStorageClass
            else:
                readStorageClass = refStorageClass

            formatter = getInstanceOf(storedFileInfo.formatter,
                                      FileDescriptor(location, readStorageClass=readStorageClass,
                                                     storageClass=writeStorageClass, parameters=parameters),
                                      ref.dataId)

            _, notFormatterParams = formatter.segregateParameters()

            # Of the remaining parameters, extract the ones supported by
            # this StorageClass (for components not all will be handled)
            assemblerParams = readStorageClass.filterParameters(notFormatterParams)

            # The ref itself could be a component if the dataset was
            # disassembled by butler, or we disassembled in datastore and
            # components came from the datastore records
            if storedFileInfo.component:
                component = storedFileInfo.component
            else:
                component = refComponent

            fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
                                                           assemblerParams, component, readStorageClass))

        return fileGetInfo


    def _prepare_for_put(self, inMemoryDataset, ref):
        """Check the arguments for ``put`` and obtain formatter and
        location.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Returns
        -------
        location : `Location`
            The location to write the dataset.
        formatter : `Formatter`
            The `Formatter` to use to write the dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.
        """
        self._validate_put_parameters(inMemoryDataset, ref)

        # Work out output file name
        try:
            template = self.templates.getTemplate(ref)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e

        location = self.locationFactory.fromPath(template.format(ref))

        # Get the formatter based on the storage class
        storageClass = ref.datasetType.storageClass
        try:
            formatter = self.formatterFactory.getFormatter(ref,
                                                           FileDescriptor(location,
                                                                          storageClass=storageClass),
                                                           ref.dataId)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e

        return location, formatter


    @abstractmethod
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        """Standardize the path of a to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.
            This implementation is provided only so
            `NotImplementedError` can be raised if the mode is not supported;
            actual transfers are deferred to `_extractIngestInfo`.

        Returns
        -------
        path : `str`
            New path in what the datastore considers standard form.

        Notes
        -----
        Subclasses of `FileLikeDatastore` should implement this method instead
        of `_prepIngest`. It should not modify the data repository or given
        file in any way.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        """
        raise NotImplementedError("Must be implemented by subclasses.")


    @abstractmethod
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        """Relocate (if necessary) and extract `StoredFileInfo` from a
        to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        ref : `DatasetRef`
            Reference for the dataset being ingested. Guaranteed to have
            ``dataset_id`` not `None`.
        formatter : `type`
            `Formatter` subclass to use for this dataset.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        info : `StoredFileInfo`
            Internal datastore record for this file. This will be inserted by
            the caller; `_extractIngestInfo` is only responsible for
            creating and populating the struct.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.
        """
        raise NotImplementedError("Must be implemented by subclasses.")


    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        filtered = []
        for dataset in datasets:
            acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
            if not acceptable:
                continue
            else:
                dataset.refs = acceptable
            if dataset.formatter is None:
                dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
            else:
                dataset.formatter = getClassOf(dataset.formatter)
            dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
            filtered.append(dataset)
        return _IngestPrepData(filtered)

    @transactional
    def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        refsAndInfos = []
        for dataset in prepData.datasets:
            # Do ingest as if the first dataset ref is associated with the file
            info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
                                           transfer=transfer)
            refsAndInfos.extend([(ref, info) for ref in dataset.refs])
        self._register_datasets(refsAndInfos)
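
    # For example, a `FileDataset` ingested with two refs produces a single
    # `StoredFileInfo` (extracted using the first ref) but records a
    # (ref, info) pair for every ref, so each ref ends up pointing at the
    # same artifact in the internal records table.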


    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            return False
        for location, _ in fileLocations:
            if not self._artifact_exists(location):
                return False

        return True
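
    # Note that for a disassembled composite every component artifact must be
    # present for ``exists`` to return `True`; a single missing file makes the
    # whole dataset count as absent.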


    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.

        Notes
        -----
        When a predicted URI is requested an attempt will be made to form
        a reasonable URI based on file templates and the expected formatter.
        """
        # if this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))

            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            storageClass = ref.datasetType.storageClass
            formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
                                                                               storageClass=storageClass))
            # Try to use the extension attribute but ignore problems if the
            # formatter does not define one.
            try:
                location = formatter.makeUpdatedLocation(location)
            except Exception:
                # Use the default extension
                pass

            # Add a URI fragment to indicate this is a guess
            return location.uri + "#predicted"

        # If this is a ref that we have written we can get the path.
        # Get file metadata and internal metadata
        storedFileInfo = self.getStoredItemInfo(ref)

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location.uri
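
    # Illustrative sketch of the predicted-URI path above (the exact URI is
    # determined by the configured file template and formatter, so the value
    # shown is hypothetical):
    #
    #     uri = datastore.getUri(ref, predict=True)
    #     # e.g. "file:///repo/run/datasetType/v1.fits#predicted"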


    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        allGetInfo = self._prepare_for_get(ref, parameters)
        refComponent = ref.datasetType.component()

        if len(allGetInfo) > 1 and not refComponent:
            # This was a disassembled dataset spread over multiple files
            # and we need to put them all back together again.
            # Read into memory and then assemble
            usedParams = set()
            components = {}
            for getInfo in allGetInfo:
                # assemblerParams are parameters not understood by the
                # associated formatter.
                usedParams.update(set(getInfo.assemblerParams))

                component = getInfo.component
                # We do not want the formatter to think it's reading
                # a component though because it is really reading a
                # standalone dataset -- always tell reader it is not a
                # component.
                components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)

            inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)

            # Any unused parameters will have to be passed to the assembler
            if parameters:
                unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
            else:
                unusedParams = {}

            # Process parameters
            return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
                                                                             parameters=unusedParams)

        else:
            # Single file request or component from that composite file
            allComponents = {i.component: i for i in allGetInfo}
            for lookup in (refComponent, None):
                if lookup in allComponents:
                    getInfo = allComponents[lookup]
                    break
            else:
                raise FileNotFoundError(f"Component {refComponent} not found "
                                        f"for ref {ref} in datastore {self.name}")

            return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)


    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        doDisassembly = self.composites.shouldBeDisassembled(ref)
        # doDisassembly = True

        artifacts = []
        if doDisassembly:
            components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
            for component, componentInfo in components.items():
                compTypeName = ref.datasetType.componentTypeName(component)
                # Don't recurse because we want to take advantage of
                # bulk insert -- need a new DatasetRef that refers to the
                # same dataset_id but has the component DatasetType.
                # DatasetType does not refer to the types of components
                # so we construct one ourselves.
                compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
                                       storageClass=componentInfo.storageClass)
                compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
                storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
                artifacts.append((compRef, storedInfo))
        else:
            # Write the entire thing out
            storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
            artifacts.append((ref, storedInfo))

        self._register_datasets(artifacts)
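
    # Sketch of the disassembly branch above: for a composite disassembled
    # into components named, say, "image" and "mask" (names here are purely
    # illustrative), two artifacts are written and both component refs share
    # the parent dataset_id:
    #
    #     artifacts == [(compRef_image, storedInfo_image),
    #                   (compRef_mask, storedInfo_mask)]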


    @transactional
    def trash(self, ref, ignore_errors=True):
        """Indicate to the datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        log.debug("Trashing %s in datastore %s", ref, self.name)

        fileLocations = self._get_dataset_locations_info(ref)

        if not fileLocations:
            err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
            if ignore_errors:
                log.warning(err_msg)
                return
            else:
                raise FileNotFoundError(err_msg)

        for location, storedFileInfo in fileLocations:
            if not self._artifact_exists(location):
                err_msg = f"Dataset is known to datastore {self.name} but " \
                          f"associated artifact ({location.uri}) is missing"
                if ignore_errors:
                    log.warning(err_msg)
                    return
                else:
                    raise FileNotFoundError(err_msg)

        # Mark dataset as trashed
        try:
            self._move_to_trash_in_registry(ref)
        except Exception as e:
            if ignore_errors:
                log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
                            f"but encountered an error: {e}")
                pass
            else:
                raise


    @transactional
    def emptyTrash(self, ignore_errors=True):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        # Context manager will empty trash iff we finish it without raising.
        with self._bridge.emptyTrash() as trashed:
            for ref in trashed:
                fileLocations = self._get_dataset_locations_info(ref)

                for location, _ in fileLocations:

                    if location is None:
                        err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
                        if ignore_errors:
                            log.warning(err_msg)
                            continue
                        else:
                            raise FileNotFoundError(err_msg)

                    if not self._artifact_exists(location):
                        err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
                        if ignore_errors:
                            log.warning(err_msg)
                            continue
                        else:
                            raise FileNotFoundError(err_msg)

                    # Can only delete the artifact if there are no references
                    # to the file from untrashed dataset refs.
                    if self._can_remove_dataset_artifact(ref, location):
                        # Point of no return for this artifact
                        log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
                        try:
                            self._delete_artifact(location)
                        except Exception as e:
                            if ignore_errors:
                                log.critical("Encountered error removing artifact %s from datastore %s: %s",
                                             location.uri, self.name, e)
                            else:
                                raise

                # Now must remove the entry from the internal registry even if
                # the artifact removal failed and was ignored,
                # otherwise the removal check above will never be true
                try:
                    # There may be multiple rows associated with this ref
                    # depending on disassembly
                    self.removeStoredItemInfo(ref)
                except Exception as e:
                    if ignore_errors:
                        log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
                                    ref.id, location.uri, self.name, e)
                        continue
                    else:
                        raise FileNotFoundError(
                            f"Error removing dataset {ref.id} from internal registry of {self.name}"
                        ) from e


    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks that all the supplied entities have valid file
        templates and also have formatters defined.
        """

        templateFailed = None
        try:
            self.templates.validateTemplates(entities, logFailures=logFailures)
        except FileTemplateValidationError as e:
            templateFailed = str(e)

        formatterFailed = []
        for entity in entities:
            try:
                self.formatterFactory.getFormatterClass(entity)
            except KeyError as e:
                formatterFailed.append(str(e))
                if logFailures:
                    log.fatal("Formatter failure: %s", e)

        if templateFailed or formatterFailed:
            messages = []
            if templateFailed:
                messages.append(templateFailed)
            if formatterFailed:
                messages.append(",".join(formatterFailed))
            msg = ";\n".join(messages)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
            self.constraints.getLookupKeys()

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        # The key can be valid in either formatters or templates so we can
        # only check the template if it exists
        if lookupKey in self.templates:
            try:
                self.templates[lookupKey].validateTemplate(entity)
            except FileTemplateValidationError as e:
                raise DatastoreValidationError(e) from e