# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Generic file-based datastore code."""

__all__ = ("FileLikeDatastore", )

import logging
from abc import abstractmethod

from sqlalchemy import Integer, String

from dataclasses import dataclass
from typing import Optional, List, Type

from lsst.daf.butler import (
    CompositesMap,
    Config,
    FileDataset,
    DatasetRef,
    DatasetType,
    DatasetTypeNotSupportedError,
    Datastore,
    DatastoreConfig,
    DatastoreValidationError,
    FakeDatasetRef,
    FileDescriptor,
    FileTemplates,
    FileTemplateValidationError,
    Formatter,
    FormatterFactory,
    Location,
    LocationFactory,
    StorageClass,
    StoredFileInfo,
)

from lsst.daf.butler import ddl
from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError

from lsst.daf.butler.core.repoRelocation import replaceRoot
from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional
from .genericDatastore import GenericBaseDatastore

log = logging.getLogger(__name__)


class _IngestPrepData(Datastore.IngestPrepData):
    """Helper class for FileLikeDatastore ingest implementation.

    Parameters
    ----------
    datasets : `list` of `FileDataset`
        Files to be ingested by this datastore.
    """
    def __init__(self, datasets: List[FileDataset]):
        super().__init__(ref for dataset in datasets for ref in dataset.refs)
        self.datasets = datasets


@dataclass(frozen=True)
class DatastoreFileGetInformation:
    """Collection of useful parameters needed to retrieve a file from
    a Datastore.
    """

    location: Location
    """The location from which to read the dataset."""

    formatter: Formatter
    """The `Formatter` to use to deserialize the dataset."""

    info: StoredFileInfo
    """Stored information about this file and its formatter."""

    assemblerParams: dict
    """Parameters to use for post-processing the retrieved dataset."""

    component: Optional[str]
    """The component to be retrieved (can be `None`)."""

    readStorageClass: StorageClass
    """The `StorageClass` of the dataset being read."""


class FileLikeDatastore(GenericBaseDatastore):
    """Generic Datastore for file-based implementations.

    Should always be sub-classed since key abstract methods are missing.

    Parameters
    ----------
    config : `DatastoreConfig` or `str`
        Configuration as either a `Config` object or URI to file.

    Raises
    ------
    ValueError
        If root location does not exist and ``create`` is `False` in the
        configuration.
    """

    defaultConfigFile = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    root: str
    """Root directory or URI of this `Datastore`."""

    locationFactory: LocationFactory
    """Factory for creating locations relative to the datastore root."""

    formatterFactory: FormatterFactory
    """Factory for creating instances of formatters."""

    templates: FileTemplates
    """File templates that can be used by this `Datastore`."""

    composites: CompositesMap
    """Determines whether a dataset should be disassembled on put."""

    @classmethod
    def setConfigRoot(cls, root, config, full, overwrite=True):
        """Set any filesystem-dependent config options for this Datastore to
        be appropriate for a new empty repository with the given root.

        Parameters
        ----------
        root : `str`
            URI to the root of the data repository.
        config : `Config`
            A `Config` to update. Only the subset understood by
            this component will be updated. Will not expand
            defaults.
        full : `Config`
            A complete config with all defaults expanded that can be
            converted to a `DatastoreConfig`. Read-only and will not be
            modified by this method.
            Repository-specific options that should not be obtained
            from defaults when Butler instances are constructed
            should be copied from ``full`` to ``config``.
        overwrite : `bool`, optional
            If `False`, do not modify a value in ``config`` if the value
            already exists. Default is always to overwrite with the provided
            ``root``.

        Notes
        -----
        If a keyword is explicitly defined in the supplied ``config`` it
        will not be overridden by this method if ``overwrite`` is `False`.
        This allows explicit values set in external configs to be retained.
        """
        Config.updateParameters(DatastoreConfig, config, full,
                                toUpdate={"root": root},
                                toCopy=("cls", ("records", "table")), overwrite=overwrite)
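
    # For illustration: on a new repository rooted at, say, "/data/repo"
    # (a hypothetical path), calling
    #
    #     FileLikeDatastore.setConfigRoot("/data/repo", config, full)
    #
    # is expected to update the ``root`` entry of the datastore section of
    # ``config`` and copy ``cls`` and ``records.table`` over from ``full``
    # where they are not already set.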

    @classmethod
    def makeTableSpec(cls):
        return ddl.TableSpec(
            fields=NamedValueSet([
                ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True),
                ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False),
                ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False),
                ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False),
                # Use empty string to indicate no component
                ddl.FieldSpec(name="component", dtype=String, length=16, primaryKey=True),
                # TODO: should checksum be Base64Bytes instead?
                ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True),
                ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True),
            ]),
            unique=frozenset(),
        )
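
    # For illustration: each artifact managed by this datastore gets one row
    # in the opaque table defined above, keyed by (dataset_id, component).
    # A hypothetical row for a disassembled component might look like
    #
    #     {"dataset_id": 42, "path": "run1/pvi/pvi_42_wcs.fits",
    #      "formatter": "lsst.mypkg.MyWcsFormatter", "storage_class": "Wcs",
    #      "component": "wcs", "checksum": None, "file_size": 5760}
    #
    # where the path, formatter, and size are invented for the example.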

    def __init__(self, config, registry, butlerRoot=None):
        super().__init__(config, registry)
        if "root" not in self.config:
            raise ValueError("No root directory specified in configuration")

        # Name ourselves either using an explicit name or a name
        # derived from the (unexpanded) root
        if "name" in self.config:
            self.name = self.config["name"]
        else:
            # We use the unexpanded root in the name to indicate that this
            # datastore can be moved without having to update registry.
            self.name = "{}@{}".format(type(self).__name__,
                                       self.config["root"])

        # Support repository relocation in config
        # Existence of self.root is checked in subclass
        self.root = replaceRoot(self.config["root"], butlerRoot)

        self.locationFactory = LocationFactory(self.root)
        self.formatterFactory = FormatterFactory()

        # Now associate formatters with storage classes
        self.formatterFactory.registerFormatters(self.config["formatters"],
                                                 universe=self.registry.dimensions)

        # Read the file naming templates
        self.templates = FileTemplates(self.config["templates"],
                                       universe=self.registry.dimensions)

        # See if composites should be disassembled
        self.composites = CompositesMap(self.config["composites"],
                                        universe=self.registry.dimensions)

        # Storage of paths and formatters, keyed by dataset_id
        self._tableName = self.config["records", "table"]
        try:
            registry.registerOpaqueTable(self._tableName, self.makeTableSpec())
        except ReadOnlyDatabaseError:
            # If the database is read only and we just tried and failed to
            # create a table, it means someone is trying to create a read-only
            # butler client for an empty repo. That should be okay, as long
            # as they don't then try to get any datasets before some other
            # client creates the table. Chances are they're just validating
            # configuration.
            pass

        # Determine whether checksums should be used
        self.useChecksum = self.config.get("checksum", True)

    def __str__(self):
        return self.root

    @abstractmethod
    def _artifact_exists(self, location):
        """Check that an artifact exists in this datastore at the specified
        location.

        Parameters
        ----------
        location : `Location`
            Expected location of the artifact associated with this datastore.

        Returns
        -------
        exists : `bool`
            `True` if the location can be found, `False` otherwise.
        """
        raise NotImplementedError()

    @abstractmethod
    def _delete_artifact(self, location):
        """Delete the artifact from the datastore.

        Parameters
        ----------
        location : `Location`
            Location of the artifact associated with this datastore.
        """
        raise NotImplementedError()

    def addStoredItemInfo(self, refs, infos):
        # Docstring inherited from GenericBaseDatastore
        records = []
        for ref, info in zip(refs, infos):
            # Component should come from ref and fall back on info
            component = ref.datasetType.component()
            if component is None and info.component is not None:
                component = info.component
            if component is None:
                # Use empty string since we want this to be part of the
                # primary key.
                component = ""
            records.append(
                dict(dataset_id=ref.id, formatter=info.formatter, path=info.path,
                     storage_class=info.storageClass.name, component=component,
                     checksum=info.checksum, file_size=info.file_size)
            )
        self.registry.insertOpaqueData(self._tableName, *records)

    def getStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore

        where = {"dataset_id": ref.id}

        # If we have no component we want the row from this table without
        # a component. If we do have a component we either need the row
        # with no component or the row with the component, depending on how
        # this dataset was disassembled.

        # If we are emptying trash we won't have real refs so can't constrain
        # by component. Will need to fix this to return multiple matches
        # in future.
        try:
            component = ref.datasetType.component()
        except AttributeError:
            component = None
        else:
            if component is None:
                where["component"] = ""

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self.registry.fetchOpaqueData(self._tableName, **where))
        if len(records) == 0:
            raise KeyError(f"Unable to retrieve location associated with dataset {ref}.")

        # If we are not asking for a component
        if not component and len(records) != 1:
            raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}")

        # If we had a FakeDatasetRef we pick the first record regardless
        if isinstance(ref, FakeDatasetRef):
            record = records[0]
        else:
            records_by_component = {}
            for r in records:
                this_component = r["component"] if r["component"] else None
                records_by_component[this_component] = r

            # Look for component by name else fall back to the parent
            for lookup in (component, None):
                if lookup in records_by_component:
                    record = records_by_component[lookup]
                    break
            else:
                raise KeyError(f"Unable to retrieve location for component {component} associated with "
                               f"dataset {ref}.")

        # Convert name of StorageClass to instance
        storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])

        return StoredFileInfo(formatter=record["formatter"],
                              path=record["path"],
                              storageClass=storageClass,
                              component=component,
                              checksum=record["checksum"],
                              file_size=record["file_size"])
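
    # Note that the component lookup above falls back to the parent record,
    # so asking for, say, the "wcs" component of a dataset that was stored as
    # a single assembled file returns the row with component="" and the
    # component itself is extracted later when the file is read.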

    def getStoredItemsInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore

        # Look for the dataset_id -- there might be multiple matches
        # if we have disassembled the dataset.
        records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id))

        results = []
        for record in records:
            # Convert name of StorageClass to instance
            storageClass = self.storageClassFactory.getStorageClass(record["storage_class"])
            component = record["component"] if record["component"] else None

            info = StoredFileInfo(formatter=record["formatter"],
                                  path=record["path"],
                                  storageClass=storageClass,
                                  component=component,
                                  checksum=record["checksum"],
                                  file_size=record["file_size"])
            results.append(info)

        return results

    def _registered_refs_per_artifact(self, pathInStore):
        """Return all dataset refs associated with the supplied path.

        Parameters
        ----------
        pathInStore : `str`
            Path of interest in the data store.

        Returns
        -------
        ids : `set` of `int`
            All `DatasetRef` IDs associated with this path.
        """
        records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore))
        ids = {r["dataset_id"] for r in records}
        return ids

    def removeStoredItemInfo(self, ref):
        # Docstring inherited from GenericBaseDatastore
        self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id)

    def _get_dataset_location_info(self, ref):
        """Find the `Location` of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        location : `Location`
            Location of the dataset within the datastore.
            Returns `None` if the dataset can not be located.
        info : `StoredFileInfo`
            Stored information about this file and its formatter.
        """
        # Get the file information (this will fail if no file)
        try:
            storedFileInfo = self.getStoredItemInfo(ref)
        except KeyError:
            return None, None

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location, storedFileInfo

    def _get_dataset_locations_info(self, ref):
        r"""Find all the `Location`\ s of the requested dataset in the
        `Datastore` and the associated stored file information.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required `Dataset`.

        Returns
        -------
        results : `list` [`tuple` [`Location`, `StoredFileInfo` ]]
            Location of the dataset within the datastore and
            stored information about each file and its formatter.
        """
        # Get the file information (this will fail if no file)
        records = self.getStoredItemsInfo(ref)

        # Use the path to determine the location
        return [(self.locationFactory.fromPath(r.path), r) for r in records]

    def _can_remove_dataset_artifact(self, ref, location):
        """Check that there is only one dataset associated with the
        specified artifact.

        Parameters
        ----------
        ref : `DatasetRef`
            Dataset to be removed.
        location : `Location`
            The location of the artifact to be removed.

        Returns
        -------
        can_remove : `bool`
            `True` if the artifact can be safely removed.
        """

        # Get all entries associated with this path
        allRefs = self._registered_refs_per_artifact(location.pathInStore)
        if not allRefs:
            raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry")

        # Get all the refs associated with this dataset if it is a composite
        theseRefs = {r.id for r in ref.flatten([ref])}

        # Remove these refs from all the refs and if there is nothing left
        # then we can delete
        remainingRefs = allRefs - theseRefs

        if remainingRefs:
            return False
        return True

    def _prepare_for_get(self, ref, parameters=None):
        """Check parameters for ``get`` and obtain formatter and
        location.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        getInfo : `list` [`DatastoreFileGetInformation`]
            Parameters needed to retrieve each file.
        """
        log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters)

        # Get file metadata and internal metadata
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            raise FileNotFoundError(f"Could not retrieve dataset {ref}.")

        # The storage class we want to use eventually
        refStorageClass = ref.datasetType.storageClass

        # Check that the supplied parameters are suitable for the type read
        refStorageClass.validateParameters(parameters)

        if len(fileLocations) > 1:
            disassembled = True
        else:
            disassembled = False

        # Is this a component request?
        refComponent = ref.datasetType.component()

        fileGetInfo = []
        for location, storedFileInfo in fileLocations:

            # The storage class used to write the file
            writeStorageClass = storedFileInfo.storageClass

            # If this has been disassembled we need read to match the write
            if disassembled:
                readStorageClass = writeStorageClass
            else:
                readStorageClass = refStorageClass

            formatter = getInstanceOf(storedFileInfo.formatter,
                                      FileDescriptor(location, readStorageClass=readStorageClass,
                                                     storageClass=writeStorageClass, parameters=parameters),
                                      ref.dataId)

            _, notFormatterParams = formatter.segregateParameters()

            # Of the remaining parameters, extract the ones supported by
            # this StorageClass (for components not all will be handled)
            assemblerParams = readStorageClass.filterParameters(notFormatterParams)

            # The ref itself could be a component if the dataset was
            # disassembled by butler, or we disassembled in datastore and
            # components came from the datastore records
            if storedFileInfo.component:
                component = storedFileInfo.component
            else:
                component = refComponent

            fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo,
                                                           assemblerParams, component, readStorageClass))

        return fileGetInfo
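
    # Note the asymmetry above: for a disassembled dataset each file is read
    # back with the storage class it was written with and reassembly happens
    # later in ``get``, whereas a single-file dataset is read directly with
    # the storage class requested by the ref.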

    def _prepare_for_put(self, inMemoryDataset, ref):
        """Check the arguments for ``put`` and obtain formatter and
        location.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Returns
        -------
        location : `Location`
            The location to write the dataset.
        formatter : `Formatter`
            The `Formatter` to use to write the dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.
        """
        self._validate_put_parameters(inMemoryDataset, ref)

        # Work out output file name
        try:
            template = self.templates.getTemplate(ref)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e

        location = self.locationFactory.fromPath(template.format(ref))

        # Get the formatter based on the storage class
        storageClass = ref.datasetType.storageClass
        try:
            formatter = self.formatterFactory.getFormatter(ref,
                                                           FileDescriptor(location,
                                                                          storageClass=storageClass),
                                                           ref.dataId)
        except KeyError as e:
            raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e

        return location, formatter
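
    # For illustration: the template looked up above turns a ref into a path
    # inside the datastore root, so a hypothetical template such as
    # "{run}/{datasetType}/{visit}" might format a ref into
    # "run1/calexp/903334", with the formatter expected to supply the file
    # extension when the artifact is written.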

    @abstractmethod
    def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str:
        """Standardize the path of a to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.
            This implementation is provided only so
            `NotImplementedError` can be raised if the mode is not supported;
            actual transfers are deferred to `_extractIngestInfo`.

        Returns
        -------
        path : `str`
            New path in what the datastore considers standard form.

        Notes
        -----
        Subclasses of `FileLikeDatastore` should implement this method instead
        of `_prepIngest`. It should not modify the data repository or given
        file in any way.

        Raises
        ------
        NotImplementedError
            Raised if the datastore does not support the given transfer mode
            (including the case where ingest is not supported at all).
        FileNotFoundError
            Raised if one of the given files does not exist.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    @abstractmethod
    def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter],
                           transfer: Optional[str] = None) -> StoredFileInfo:
        """Relocate (if necessary) and extract `StoredFileInfo` from a
        to-be-ingested file.

        Parameters
        ----------
        path : `str`
            Path of a file to be ingested.
        ref : `DatasetRef`
            Reference for the dataset being ingested. Guaranteed to have
            ``dataset_id`` not `None`.
        formatter : `type`
            `Formatter` subclass to use for this dataset.
        transfer : `str`, optional
            How (and whether) the dataset should be added to the datastore.
            See `ingest` for details of transfer modes.

        Returns
        -------
        info : `StoredFileInfo`
            Internal datastore record for this file. This will be inserted by
            the caller; `_extractIngestInfo` is only responsible for
            creating and populating the struct.

        Raises
        ------
        FileNotFoundError
            Raised if one of the given files does not exist.
        FileExistsError
            Raised if transfer is not `None` but the (internal) location the
            file would be moved to is already occupied.
        """
        raise NotImplementedError("Must be implemented by subclasses.")

    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData:
        # Docstring inherited from Datastore._prepIngest.
        filtered = []
        for dataset in datasets:
            acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)]
            if not acceptable:
                continue
            else:
                dataset.refs = acceptable
            if dataset.formatter is None:
                dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0])
            else:
                dataset.formatter = getClassOf(dataset.formatter)
            dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer)
            filtered.append(dataset)
        return _IngestPrepData(filtered)

    @transactional
    def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None):
        # Docstring inherited from Datastore._finishIngest.
        refsAndInfos = []
        for dataset in prepData.datasets:
            # Do ingest as if the first dataset ref is associated with the file
            info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter,
                                           transfer=transfer)
            refsAndInfos.extend([(ref, info) for ref in dataset.refs])
        self._register_datasets(refsAndInfos)
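
    # Note that ingest is split into two phases: ``_prepIngest`` filters out
    # refs rejected by the constraints, resolves formatter classes, and
    # standardizes paths without touching the repository, while
    # ``_finishIngest`` performs any transfers inside a transaction and
    # registers the resulting records.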

    def exists(self, ref):
        """Check if the dataset exists in the datastore.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required dataset.

        Returns
        -------
        exists : `bool`
            `True` if the entity exists in the `Datastore`.
        """
        fileLocations = self._get_dataset_locations_info(ref)
        if not fileLocations:
            return False
        for location, _ in fileLocations:
            if not self._artifact_exists(location):
                return False

        return True

    def getUri(self, ref, predict=False):
        """URI to the Dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the dataset within the datastore. If the
            dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI, the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.

        Notes
        -----
        When a predicted URI is requested an attempt will be made to form
        a reasonable URI based on file templates and the expected formatter.
        """
        # If this has never been written then we have to guess
        if not self.exists(ref):
            if not predict:
                raise FileNotFoundError("Dataset {} not in this datastore".format(ref))

            template = self.templates.getTemplate(ref)
            location = self.locationFactory.fromPath(template.format(ref))
            storageClass = ref.datasetType.storageClass
            formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location,
                                                                               storageClass=storageClass))
            # Try to use the extension attribute but ignore problems if the
            # formatter does not define one.
            try:
                location = formatter.makeUpdatedLocation(location)
            except Exception:
                # Use the default extension
                pass

            # Add a URI fragment to indicate this is a guess
            return location.uri + "#predicted"

        # If this is a ref that we have written we can get the path.
        # Get file metadata and internal metadata
        storedFileInfo = self.getStoredItemInfo(ref)

        # Use the path to determine the location
        location = self.locationFactory.fromPath(storedFileInfo.path)

        return location.uri
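
    # For illustration: with ``predict=True`` a dataset that has not been
    # written yet might come back with a URI such as
    # "file:///repo/run1/calexp/903334.fits#predicted" (a hypothetical path),
    # the fragment marking the value as a guess.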

    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset can not be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        allGetInfo = self._prepare_for_get(ref, parameters)
        refComponent = ref.datasetType.component()

        if len(allGetInfo) > 1 and not refComponent:
            # This was a disassembled dataset spread over multiple files
            # and we need to put them all back together again.
            # Read into memory and then assemble
            usedParams = set()
            components = {}
            for getInfo in allGetInfo:
                # assemblerParams are parameters not understood by the
                # associated formatter.
                usedParams.update(set(getInfo.assemblerParams))

                component = getInfo.component
                # We do not want the formatter to think it's reading
                # a component though because it is really reading a
                # standalone dataset -- always tell reader it is not a
                # component.
                components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False)

            inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components)

            # Any unused parameters will have to be passed to the assembler
            if parameters:
                unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
            else:
                unusedParams = {}

            # Process parameters
            return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset,
                                                                             parameters=unusedParams)

        else:
            # Single file request or component from that composite file
            allComponents = {i.component: i for i in allGetInfo}
            for lookup in (refComponent, None):
                if lookup in allComponents:
                    getInfo = allComponents[lookup]
                    break
            else:
                raise FileNotFoundError(f"Component {refComponent} not found "
                                        f"for ref {ref} in datastore {self.name}")

            return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None)

    @transactional
    def put(self, inMemoryDataset, ref):
        """Write an InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`. The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """

        doDisassembly = self.composites.shouldBeDisassembled(ref)
        # doDisassembly = True

        artifacts = []
        if doDisassembly:
            components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset)
            for component, componentInfo in components.items():
                compTypeName = ref.datasetType.componentTypeName(component)
                # Don't recurse because we want to take advantage of
                # bulk insert -- need a new DatasetRef that refers to the
                # same dataset_id but has the component DatasetType.
                # DatasetType does not refer to the types of components,
                # so we construct one ourselves.
                compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions,
                                       storageClass=componentInfo.storageClass)
                compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False)
                storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef)
                artifacts.append((compRef, storedInfo))
        else:
            # Write the entire thing out
            storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref)
            artifacts.append((ref, storedInfo))

        self._register_datasets(artifacts)
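
    # Note that when a composite is disassembled on put, every component is
    # written as its own artifact and registered with the same dataset_id but
    # a distinct component name; those per-component rows are what the
    # component fallback in ``getStoredItemInfo`` relies on.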

    @transactional
    def trash(self, ref, ignore_errors=True):
        """Indicate to the datastore that a dataset can be removed.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.

        Raises
        ------
        FileNotFoundError
            Attempt to remove a dataset that does not exist.
        """
        # Get file metadata and internal metadata
        log.debug("Trashing %s in datastore %s", ref, self.name)

        fileLocations = self._get_dataset_locations_info(ref)

        if not fileLocations:
            err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}"
            if ignore_errors:
                log.warning(err_msg)
                return
            else:
                raise FileNotFoundError(err_msg)

        for location, storedFileInfo in fileLocations:
            if not self._artifact_exists(location):
                err_msg = f"Dataset is known to datastore {self.name} but " \
                          f"associated artifact ({location.uri}) is missing"
                if ignore_errors:
                    log.warning(err_msg)
                    return
                else:
                    raise FileNotFoundError(err_msg)

        # Mark dataset as trashed
        try:
            self._move_to_trash_in_registry(ref)
        except Exception as e:
            if ignore_errors:
                log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} "
                            f"but encountered an error: {e}")
                pass
            else:
                raise

    @transactional
    def emptyTrash(self, ignore_errors=True):
        """Remove all datasets from the trash.

        Parameters
        ----------
        ignore_errors : `bool`
            If `True` return without error even if something went wrong.
            Problems could occur if another process is simultaneously trying
            to delete.
        """
        log.debug("Emptying trash in datastore %s", self.name)
        trashed = self.registry.getTrashedDatasets(self.name)

        for ref in trashed:
            fileLocations = self._get_dataset_locations_info(ref)

            for location, _ in fileLocations:

                if location is None:
                    err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}"
                    if ignore_errors:
                        log.warning(err_msg)
                        continue
                    else:
                        raise FileNotFoundError(err_msg)

                if not self._artifact_exists(location):
                    err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}"
                    if ignore_errors:
                        log.warning(err_msg)
                        continue
                    else:
                        raise FileNotFoundError(err_msg)

                # Can only delete the artifact if there are no references
                # to the file from untrashed dataset refs.
                if self._can_remove_dataset_artifact(ref, location):
                    # Point of no return for this artifact
                    log.debug("Removing artifact %s from datastore %s", location.uri, self.name)
                    try:
                        self._delete_artifact(location)
                    except Exception as e:
                        if ignore_errors:
                            log.critical("Encountered error removing artifact %s from datastore %s: %s",
                                         location.uri, self.name, e)
                        else:
                            raise

            # Now must remove the entry from the internal registry even if
            # the artifact removal failed and was ignored,
            # otherwise the removal check above will never be true
            try:
                # There may be multiple rows associated with this ref
                # depending on disassembly
                self.removeStoredItemInfo(ref)
            except Exception as e:
                if ignore_errors:
                    log.warning("Error removing dataset %s (%s) from internal registry of %s: %s",
                                ref.id, location.uri, self.name, e)
                    continue
                else:
                    raise

        # Inform registry that we have removed items from datastore
        # This should work even if another process is clearing out those rows
        self.registry.emptyDatasetLocationsTrash(self.name, trashed)
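
    # Note the two-step removal workflow: ``trash`` only marks datasets as
    # removable in the registry, while ``emptyTrash`` later deletes the
    # artifacts (once no untrashed ref still points at them) and cleans up
    # the internal records.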

    def validateConfiguration(self, entities, logFailures=False):
        """Validate some of the configuration for this datastore.

        Parameters
        ----------
        entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass`
            Entities to test against this configuration. Can be differing
            types.
        logFailures : `bool`, optional
            If `True`, output a log message for every validation error
            detected.

        Raises
        ------
        DatastoreValidationError
            Raised if there is a validation problem with a configuration.
            All the problems are reported in a single exception.

        Notes
        -----
        This method checks that all the supplied entities have valid file
        templates and also have formatters defined.
        """

        templateFailed = None
        try:
            self.templates.validateTemplates(entities, logFailures=logFailures)
        except FileTemplateValidationError as e:
            templateFailed = str(e)

        formatterFailed = []
        for entity in entities:
            try:
                self.formatterFactory.getFormatterClass(entity)
            except KeyError as e:
                formatterFailed.append(str(e))
                if logFailures:
                    log.fatal("Formatter failure: %s", e)

        if templateFailed or formatterFailed:
            messages = []
            if templateFailed:
                messages.append(templateFailed)
            if formatterFailed:
                messages.append(",".join(formatterFailed))
            msg = ";\n".join(messages)
            raise DatastoreValidationError(msg)

    def getLookupKeys(self):
        # Docstring is inherited from base class
        return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \
            self.constraints.getLookupKeys()

    def validateKey(self, lookupKey, entity):
        # Docstring is inherited from base class
        # The key can be valid in either formatters or templates so we can
        # only check the template if it exists
        if lookupKey in self.templates:
            try:
                self.templates[lookupKey].validateTemplate(entity)
            except FileTemplateValidationError as e:
                raise DatastoreValidationError(e) from e
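

# A concrete subclass is expected to supply the artifact-level operations
# left abstract above.  As a rough, hypothetical sketch of a POSIX-style
# implementation (class name and config path invented for illustration):
#
#     import os
#
#     class MyPosixDatastore(FileLikeDatastore):
#         defaultConfigFile = "datastores/myPosixDatastore.yaml"
#
#         def _artifact_exists(self, location):
#             # Location is assumed to expose a filesystem path here.
#             return os.path.exists(location.path)
#
#         def _delete_artifact(self, location):
#             os.remove(location.path)
#
# A real subclass would also need the ingest hooks (_standardizeIngestPath,
# _extractIngestInfo) and the read/write helpers used above, e.g.
# _read_artifact_into_memory and _write_in_memory_to_artifact.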