Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Generic file-based datastore code.""" 

23 

24__all__ = ("FileLikeDatastore", ) 

25 

26import logging 

27from abc import abstractmethod 

28 

29from sqlalchemy import Integer, String 

30 

31from dataclasses import dataclass 

32from typing import Optional, List, Type 

33 

34from lsst.daf.butler import ( 

35 CompositesMap, 

36 Config, 

37 FileDataset, 

38 DatasetRef, 

39 DatasetType, 

40 DatasetTypeNotSupportedError, 

41 Datastore, 

42 DatastoreConfig, 

43 DatastoreValidationError, 

44 FakeDatasetRef, 

45 FileDescriptor, 

46 FileTemplates, 

47 FileTemplateValidationError, 

48 Formatter, 

49 FormatterFactory, 

50 Location, 

51 LocationFactory, 

52 StorageClass, 

53 StoredFileInfo, 

54) 

55 

56from lsst.daf.butler import ddl 

57from lsst.daf.butler.registry.interfaces import ReadOnlyDatabaseError 

58 

59from lsst.daf.butler.core.repoRelocation import replaceRoot 

60from lsst.daf.butler.core.utils import getInstanceOf, NamedValueSet, getClassOf, transactional 

61from .genericDatastore import GenericBaseDatastore 

62 

63log = logging.getLogger(__name__) 

64 

65# String to use when a Python None is encountered 

66NULLSTR = "__NULL_STRING__" 

67 

68 

69class _IngestPrepData(Datastore.IngestPrepData): 

70 """Helper class for FileLikeDatastore ingest implementation. 

71 

72 Parameters 

73 ---------- 

74 datasets : `list` of `FileDataset` 

75 Files to be ingested by this datastore. 

76 """ 

77 def __init__(self, datasets: List[FileDataset]): 

78 super().__init__(ref for dataset in datasets for ref in dataset.refs) 

79 self.datasets = datasets 

80 

81 

82@dataclass(frozen=True) 

83class DatastoreFileGetInformation: 

84 """Collection of useful parameters needed to retrieve a file from 

85 a Datastore. 

86 """ 

87 

88 location: Location 

89 """The location from which to read the dataset.""" 

90 

91 formatter: Formatter 

92 """The `Formatter` to use to deserialize the dataset.""" 

93 

94 info: StoredFileInfo 

95 """Stored information about this file and its formatter.""" 

96 

97 assemblerParams: dict 

98 """Parameters to use for post-processing the retrieved dataset.""" 

99 

100 component: Optional[str] 

101 """The component to be retrieved (can be `None`).""" 

102 

103 readStorageClass: StorageClass 

104 """The `StorageClass` of the dataset being read.""" 

105 

106 

107class FileLikeDatastore(GenericBaseDatastore): 

108 """Generic Datastore for file-based implementations. 

109 

110 Should always be sub-classed since key abstract methods are missing. 

111 

112 Parameters 

113 ---------- 

114 config : `DatastoreConfig` or `str` 

115 Configuration as either a `Config` object or URI to file. 

116 

117 Raises 

118 ------ 

119 ValueError 

120 If root location does not exist and ``create`` is `False` in the 

121 configuration. 

122 """ 

123 

124 defaultConfigFile = None 

125 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

126 absolute path. Can be None if no defaults specified. 

127 """ 

128 

129 root: str 

130 """Root directory or URI of this `Datastore`.""" 

131 

132 locationFactory: LocationFactory 

133 """Factory for creating locations relative to the datastore root.""" 

134 

135 formatterFactory: FormatterFactory 

136 """Factory for creating instances of formatters.""" 

137 

138 templates: FileTemplates 

139 """File templates that can be used by this `Datastore`.""" 

140 

141 composites: CompositesMap 

142 """Determines whether a dataset should be disassembled on put.""" 

143 

144 @classmethod 

145 def setConfigRoot(cls, root, config, full, overwrite=True): 

146 """Set any filesystem-dependent config options for this Datastore to 

147 be appropriate for a new empty repository with the given root. 

148 

149 Parameters 

150 ---------- 

151 root : `str` 

152 URI to the root of the data repository. 

153 config : `Config` 

154 A `Config` to update. Only the subset understood by 

155 this component will be updated. Will not expand 

156 defaults. 

157 full : `Config` 

158 A complete config with all defaults expanded that can be 

159 converted to a `DatastoreConfig`. Read-only and will not be 

160 modified by this method. 

161 Repository-specific options that should not be obtained 

162 from defaults when Butler instances are constructed 

163 should be copied from ``full`` to ``config``. 

164 overwrite : `bool`, optional 

165 If `False`, do not modify a value in ``config`` if the value 

166 already exists. Default is always to overwrite with the provided 

167 ``root``. 

168 

169 Notes 

170 ----- 

171 If a keyword is explicitly defined in the supplied ``config`` it 

172 will not be overridden by this method if ``overwrite`` is `False`. 

173 This allows explicit values set in external configs to be retained. 

174 """ 

175 Config.updateParameters(DatastoreConfig, config, full, 

176 toUpdate={"root": root}, 

177 toCopy=("cls", ("records", "table")), overwrite=overwrite) 

178 

179 @classmethod 

180 def makeTableSpec(cls): 

181 return ddl.TableSpec( 

182 fields=NamedValueSet([ 

183 ddl.FieldSpec(name="dataset_id", dtype=Integer, primaryKey=True), 

184 ddl.FieldSpec(name="path", dtype=String, length=256, nullable=False), 

185 ddl.FieldSpec(name="formatter", dtype=String, length=128, nullable=False), 

186 ddl.FieldSpec(name="storage_class", dtype=String, length=64, nullable=False), 

187 # Use empty string to indicate no component 

188 ddl.FieldSpec(name="component", dtype=String, length=32, primaryKey=True), 

189 # TODO: should checksum be Base64Bytes instead? 

190 ddl.FieldSpec(name="checksum", dtype=String, length=128, nullable=True), 

191 ddl.FieldSpec(name="file_size", dtype=Integer, nullable=True), 

192 ]), 

193 unique=frozenset(), 

194 ) 

195 

196 def __init__(self, config, registry, butlerRoot=None): 

197 super().__init__(config, registry) 

198 if "root" not in self.config: 198 ↛ 199line 198 didn't jump to line 199, because the condition on line 198 was never true

199 raise ValueError("No root directory specified in configuration") 

200 

201 # Name ourselves either using an explicit name or a name 

202 # derived from the (unexpanded) root 

203 if "name" in self.config: 

204 self.name = self.config["name"] 

205 else: 

206 # We use the unexpanded root in the name to indicate that this 

207 # datastore can be moved without having to update registry. 

208 self.name = "{}@{}".format(type(self).__name__, 

209 self.config["root"]) 

210 

211 # Support repository relocation in config 

212 # Existence of self.root is checked in subclass 

213 self.root = replaceRoot(self.config["root"], butlerRoot) 

214 

215 self.locationFactory = LocationFactory(self.root) 

216 self.formatterFactory = FormatterFactory() 

217 

218 # Now associate formatters with storage classes 

219 self.formatterFactory.registerFormatters(self.config["formatters"], 

220 universe=self.registry.dimensions) 

221 

222 # Read the file naming templates 

223 self.templates = FileTemplates(self.config["templates"], 

224 universe=self.registry.dimensions) 

225 

226 # See if composites should be disassembled 

227 self.composites = CompositesMap(self.config["composites"], 

228 universe=self.registry.dimensions) 

229 

230 # Storage of paths and formatters, keyed by dataset_id 

231 self._tableName = self.config["records", "table"] 

232 try: 

233 registry.registerOpaqueTable(self._tableName, self.makeTableSpec()) 

234 except ReadOnlyDatabaseError: 

235 # If the database is read only and we just tried and failed to 

236 # create a table, it means someone is trying to create a read-only 

237 # butler client for an empty repo. That should be okay, as long 

238 # as they then try to get any datasets before some other client 

239 # creates the table. Chances are they'rejust validating 

240 # configuration. 

241 pass 

242 

243 # Determine whether checksums should be used 

244 self.useChecksum = self.config.get("checksum", True) 

245 

246 def __str__(self): 

247 return self.root 

248 

249 @abstractmethod 

250 def _artifact_exists(self, location): 

251 """Check that an artifact exists in this datastore at the specified 

252 location. 

253 

254 Parameters 

255 ---------- 

256 location : `Location` 

257 Expected location of the artifact associated with this datastore. 

258 

259 Returns 

260 ------- 

261 exists : `bool` 

262 True if the location can be found, false otherwise. 

263 """ 

264 raise NotImplementedError() 

265 

266 @abstractmethod 

267 def _delete_artifact(self, location): 

268 """Delete the artifact from the datastore. 

269 

270 Parameters 

271 ---------- 

272 location : `Location` 

273 Location of the artifact associated with this datastore. 

274 """ 

275 raise NotImplementedError() 

276 

277 def addStoredItemInfo(self, refs, infos): 

278 # Docstring inherited from GenericBaseDatastore 

279 records = [] 

280 for ref, info in zip(refs, infos): 

281 # Component should come from ref and fall back on info 

282 component = ref.datasetType.component() 

283 if component is None and info.component is not None: 283 ↛ 284line 283 didn't jump to line 284, because the condition on line 283 was never true

284 component = info.component 

285 if component is None: 

286 # Use empty string since we want this to be part of the 

287 # primary key. 

288 component = NULLSTR 

289 records.append( 

290 dict(dataset_id=ref.id, formatter=info.formatter, path=info.path, 

291 storage_class=info.storageClass.name, component=component, 

292 checksum=info.checksum, file_size=info.file_size) 

293 ) 

294 self.registry.insertOpaqueData(self._tableName, *records) 

295 

296 def getStoredItemInfo(self, ref): 

297 # Docstring inherited from GenericBaseDatastore 

298 

299 where = {"dataset_id": ref.id} 

300 

301 # If we have no component we want the row from this table without 

302 # a component. If we do have a component we either need the row 

303 # with no component or the row with the component, depending on how 

304 # this dataset was dissassembled. 

305 

306 # if we are emptying trash we won't have real refs so can't constrain 

307 # by component. Will need to fix this to return multiple matches 

308 # in future. 

309 try: 

310 component = ref.datasetType.component() 

311 except AttributeError: 

312 component = None 

313 else: 

314 if component is None: 

315 where["component"] = NULLSTR 

316 

317 # Look for the dataset_id -- there might be multiple matches 

318 # if we have disassembled the dataset. 

319 records = list(self.registry.fetchOpaqueData(self._tableName, **where)) 

320 if len(records) == 0: 320 ↛ 321line 320 didn't jump to line 321, because the condition on line 320 was never true

321 raise KeyError(f"Unable to retrieve location associated with dataset {ref}.") 

322 

323 # if we are not asking for a component 

324 if not component and len(records) != 1: 324 ↛ 325line 324 didn't jump to line 325, because the condition on line 324 was never true

325 raise RuntimeError(f"Got {len(records)} from location query of dataset {ref}") 

326 

327 # if we had a FakeDatasetRef we pick the first record regardless 

328 if isinstance(ref, FakeDatasetRef): 328 ↛ 329line 328 didn't jump to line 329, because the condition on line 328 was never true

329 record = records[0] 

330 else: 

331 records_by_component = {} 

332 for r in records: 

333 this_component = r["component"] if r["component"] and r["component"] != NULLSTR else None 

334 records_by_component[this_component] = r 

335 

336 # Look for component by name else fall back to the parent 

337 for lookup in (component, None): 337 ↛ 342line 337 didn't jump to line 342, because the loop on line 337 didn't complete

338 if lookup in records_by_component: 338 ↛ 337line 338 didn't jump to line 337, because the condition on line 338 was never false

339 record = records_by_component[lookup] 

340 break 

341 else: 

342 raise KeyError(f"Unable to retrieve location for component {component} associated with " 

343 f"dataset {ref}.") 

344 

345 # Convert name of StorageClass to instance 

346 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

347 

348 return StoredFileInfo(formatter=record["formatter"], 

349 path=record["path"], 

350 storageClass=storageClass, 

351 component=component, 

352 checksum=record["checksum"], 

353 file_size=record["file_size"]) 

354 

355 def getStoredItemsInfo(self, ref): 

356 # Docstring inherited from GenericBaseDatastore 

357 

358 # Look for the dataset_id -- there might be multiple matches 

359 # if we have disassembled the dataset. 

360 records = list(self.registry.fetchOpaqueData(self._tableName, dataset_id=ref.id)) 

361 

362 results = [] 

363 for record in records: 

364 # Convert name of StorageClass to instance 

365 storageClass = self.storageClassFactory.getStorageClass(record["storage_class"]) 

366 component = record["component"] if (record["component"] 

367 and record["component"] != NULLSTR) else None 

368 

369 info = StoredFileInfo(formatter=record["formatter"], 

370 path=record["path"], 

371 storageClass=storageClass, 

372 component=component, 

373 checksum=record["checksum"], 

374 file_size=record["file_size"]) 

375 results.append(info) 

376 

377 return results 

378 

379 def _registered_refs_per_artifact(self, pathInStore): 

380 """Return all dataset refs associated with the supplied path. 

381 

382 Parameters 

383 ---------- 

384 pathInStore : `str` 

385 Path of interest in the data store. 

386 

387 Returns 

388 ------- 

389 ids : `set` of `int` 

390 All `DatasetRef` IDs associated with this path. 

391 """ 

392 records = list(self.registry.fetchOpaqueData(self._tableName, path=pathInStore)) 

393 ids = {r["dataset_id"] for r in records} 

394 return ids 

395 

396 def removeStoredItemInfo(self, ref): 

397 # Docstring inherited from GenericBaseDatastore 

398 self.registry.deleteOpaqueData(self._tableName, dataset_id=ref.id) 

399 

400 def _get_dataset_location_info(self, ref): 

401 """Find the `Location` of the requested dataset in the 

402 `Datastore` and the associated stored file information. 

403 

404 Parameters 

405 ---------- 

406 ref : `DatasetRef` 

407 Reference to the required `Dataset`. 

408 

409 Returns 

410 ------- 

411 location : `Location` 

412 Location of the dataset within the datastore. 

413 Returns `None` if the dataset can not be located. 

414 info : `StoredFileInfo` 

415 Stored information about this file and its formatter. 

416 """ 

417 # Get the file information (this will fail if no file) 

418 try: 

419 storedFileInfo = self.getStoredItemInfo(ref) 

420 except KeyError: 

421 return None, None 

422 

423 # Use the path to determine the location 

424 location = self.locationFactory.fromPath(storedFileInfo.path) 

425 

426 return location, storedFileInfo 

427 

428 def _get_dataset_locations_info(self, ref): 

429 r"""Find all the `Location`\ s of the requested dataset in the 

430 `Datastore` and the associated stored file information. 

431 

432 Parameters 

433 ---------- 

434 ref : `DatasetRef` 

435 Reference to the required `Dataset`. 

436 

437 Returns 

438 ------- 

439 results : `list` [`tuple` [`Location`, `StoredFileInfo` ]] 

440 Location of the dataset within the datastore and 

441 stored information about each file and its formatter. 

442 """ 

443 # Get the file information (this will fail if no file) 

444 records = self.getStoredItemsInfo(ref) 

445 

446 # Use the path to determine the location 

447 return [(self.locationFactory.fromPath(r.path), r) for r in records] 

448 

449 def _can_remove_dataset_artifact(self, ref, location): 

450 """Check that there is only one dataset associated with the 

451 specified artifact. 

452 

453 Parameters 

454 ---------- 

455 ref : `DatasetRef` 

456 Dataset to be removed. 

457 location : `Location` 

458 The location of the artifact to be removed. 

459 

460 Returns 

461 ------- 

462 can_remove : `Bool` 

463 True if the artifact can be safely removed. 

464 """ 

465 

466 # Get all entries associated with this path 

467 allRefs = self._registered_refs_per_artifact(location.pathInStore) 

468 if not allRefs: 468 ↛ 469line 468 didn't jump to line 469, because the condition on line 468 was never true

469 raise RuntimeError(f"Datastore inconsistency error. {location.pathInStore} not in registry") 

470 

471 # Get all the refs associated with this dataset if it is a composite 

472 theseRefs = {r.id for r in ref.flatten([ref])} 

473 

474 # Remove these refs from all the refs and if there is nothing left 

475 # then we can delete 

476 remainingRefs = allRefs - theseRefs 

477 

478 if remainingRefs: 

479 return False 

480 return True 

481 

482 def _prepare_for_get(self, ref, parameters=None): 

483 """Check parameters for ``get`` and obtain formatter and 

484 location. 

485 

486 Parameters 

487 ---------- 

488 ref : `DatasetRef` 

489 Reference to the required Dataset. 

490 parameters : `dict` 

491 `StorageClass`-specific parameters that specify, for example, 

492 a slice of the dataset to be loaded. 

493 

494 Returns 

495 ------- 

496 getInfo : `list` [`DatastoreFileGetInformation`] 

497 Parameters needed to retrieve each file. 

498 """ 

499 log.debug("Retrieve %s from %s with parameters %s", ref, self.name, parameters) 

500 

501 # Get file metadata and internal metadata 

502 fileLocations = self._get_dataset_locations_info(ref) 

503 if not fileLocations: 

504 raise FileNotFoundError(f"Could not retrieve dataset {ref}.") 

505 

506 # The storage class we want to use eventually 

507 refStorageClass = ref.datasetType.storageClass 

508 

509 # Check that the supplied parameters are suitable for the type read 

510 refStorageClass.validateParameters(parameters) 

511 

512 if len(fileLocations) > 1: 

513 disassembled = True 

514 else: 

515 disassembled = False 

516 

517 # Is this a component request? 

518 refComponent = ref.datasetType.component() 

519 

520 fileGetInfo = [] 

521 for location, storedFileInfo in fileLocations: 

522 

523 # The storage class used to write the file 

524 writeStorageClass = storedFileInfo.storageClass 

525 

526 # If this has been disassembled we need read to match the write 

527 if disassembled: 

528 readStorageClass = writeStorageClass 

529 else: 

530 readStorageClass = refStorageClass 

531 

532 formatter = getInstanceOf(storedFileInfo.formatter, 

533 FileDescriptor(location, readStorageClass=readStorageClass, 

534 storageClass=writeStorageClass, parameters=parameters), 

535 ref.dataId) 

536 

537 _, notFormatterParams = formatter.segregateParameters() 

538 

539 # Of the remaining parameters, extract the ones supported by 

540 # this StorageClass (for components not all will be handled) 

541 assemblerParams = readStorageClass.filterParameters(notFormatterParams) 

542 

543 # The ref itself could be a component if the dataset was 

544 # disassembled by butler, or we disassembled in datastore and 

545 # components came from the datastore records 

546 if storedFileInfo.component: 

547 component = storedFileInfo.component 

548 else: 

549 component = refComponent 

550 

551 fileGetInfo.append(DatastoreFileGetInformation(location, formatter, storedFileInfo, 

552 assemblerParams, component, readStorageClass)) 

553 

554 return fileGetInfo 

555 

556 def _prepare_for_put(self, inMemoryDataset, ref): 

557 """Check the arguments for ``put`` and obtain formatter and 

558 location. 

559 

560 Parameters 

561 ---------- 

562 inMemoryDataset : `object` 

563 The dataset to store. 

564 ref : `DatasetRef` 

565 Reference to the associated Dataset. 

566 

567 Returns 

568 ------- 

569 location : `Location` 

570 The location to write the dataset. 

571 formatter : `Formatter` 

572 The `Formatter` to use to write the dataset. 

573 

574 Raises 

575 ------ 

576 TypeError 

577 Supplied object and storage class are inconsistent. 

578 DatasetTypeNotSupportedError 

579 The associated `DatasetType` is not handled by this datastore. 

580 """ 

581 self._validate_put_parameters(inMemoryDataset, ref) 

582 

583 # Work out output file name 

584 try: 

585 template = self.templates.getTemplate(ref) 

586 except KeyError as e: 

587 raise DatasetTypeNotSupportedError(f"Unable to find template for {ref}") from e 

588 

589 location = self.locationFactory.fromPath(template.format(ref)) 

590 

591 # Get the formatter based on the storage class 

592 storageClass = ref.datasetType.storageClass 

593 try: 

594 formatter = self.formatterFactory.getFormatter(ref, 

595 FileDescriptor(location, 

596 storageClass=storageClass), 

597 ref.dataId) 

598 except KeyError as e: 

599 raise DatasetTypeNotSupportedError(f"Unable to find formatter for {ref}") from e 

600 

601 return location, formatter 

602 

603 @abstractmethod 

604 def _standardizeIngestPath(self, path: str, *, transfer: Optional[str] = None) -> str: 

605 """Standardize the path of a to-be-ingested file. 

606 

607 Parameters 

608 ---------- 

609 path : `str` 

610 Path of a file to be ingested. 

611 transfer : `str`, optional 

612 How (and whether) the dataset should be added to the datastore. 

613 See `ingest` for details of transfer modes. 

614 This implementation is provided only so 

615 `NotImplementedError` can be raised if the mode is not supported; 

616 actual transfers are deferred to `_extractIngestInfo`. 

617 

618 Returns 

619 ------- 

620 path : `str` 

621 New path in what the datastore considers standard form. 

622 

623 Notes 

624 ----- 

625 Subclasses of `FileLikeDatastore` should implement this method instead 

626 of `_prepIngest`. It should not modify the data repository or given 

627 file in any way. 

628 

629 Raises 

630 ------ 

631 NotImplementedError 

632 Raised if the datastore does not support the given transfer mode 

633 (including the case where ingest is not supported at all). 

634 FileNotFoundError 

635 Raised if one of the given files does not exist. 

636 """ 

637 raise NotImplementedError("Must be implemented by subclasses.") 

638 

639 @abstractmethod 

640 def _extractIngestInfo(self, path: str, ref: DatasetRef, *, formatter: Type[Formatter], 

641 transfer: Optional[str] = None) -> StoredFileInfo: 

642 """Relocate (if necessary) and extract `StoredFileInfo` from a 

643 to-be-ingested file. 

644 

645 Parameters 

646 ---------- 

647 path : `str` 

648 Path of a file to be ingested. 

649 ref : `DatasetRef` 

650 Reference for the dataset being ingested. Guaranteed to have 

651 ``dataset_id not None`. 

652 formatter : `type` 

653 `Formatter` subclass to use for this dataset. 

654 transfer : `str`, optional 

655 How (and whether) the dataset should be added to the datastore. 

656 See `ingest` for details of transfer modes. 

657 

658 Returns 

659 ------- 

660 info : `StoredFileInfo` 

661 Internal datastore record for this file. This will be inserted by 

662 the caller; the `_extractIngestInfo` is only resposible for 

663 creating and populating the struct. 

664 

665 Raises 

666 ------ 

667 FileNotFoundError 

668 Raised if one of the given files does not exist. 

669 FileExistsError 

670 Raised if transfer is not `None` but the (internal) location the 

671 file would be moved to is already occupied. 

672 """ 

673 raise NotImplementedError("Must be implemented by subclasses.") 

674 

675 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> _IngestPrepData: 

676 # Docstring inherited from Datastore._prepIngest. 

677 filtered = [] 

678 for dataset in datasets: 

679 acceptable = [ref for ref in dataset.refs if self.constraints.isAcceptable(ref)] 

680 if not acceptable: 

681 continue 

682 else: 

683 dataset.refs = acceptable 

684 if dataset.formatter is None: 

685 dataset.formatter = self.formatterFactory.getFormatterClass(dataset.refs[0]) 

686 else: 

687 dataset.formatter = getClassOf(dataset.formatter) 

688 dataset.path = self._standardizeIngestPath(dataset.path, transfer=transfer) 

689 filtered.append(dataset) 

690 return _IngestPrepData(filtered) 

691 

692 @transactional 

693 def _finishIngest(self, prepData: Datastore.IngestPrepData, *, transfer: Optional[str] = None): 

694 # Docstring inherited from Datastore._finishIngest. 

695 refsAndInfos = [] 

696 for dataset in prepData.datasets: 

697 # Do ingest as if the first dataset ref is associated with the file 

698 info = self._extractIngestInfo(dataset.path, dataset.refs[0], formatter=dataset.formatter, 

699 transfer=transfer) 

700 refsAndInfos.extend([(ref, info) for ref in dataset.refs]) 

701 self._register_datasets(refsAndInfos) 

702 

703 def exists(self, ref): 

704 """Check if the dataset exists in the datastore. 

705 

706 Parameters 

707 ---------- 

708 ref : `DatasetRef` 

709 Reference to the required dataset. 

710 

711 Returns 

712 ------- 

713 exists : `bool` 

714 `True` if the entity exists in the `Datastore`. 

715 """ 

716 fileLocations = self._get_dataset_locations_info(ref) 

717 if not fileLocations: 

718 return False 

719 for location, _ in fileLocations: 

720 if not self._artifact_exists(location): 

721 return False 

722 

723 return True 

724 

725 def getUri(self, ref, predict=False): 

726 """URI to the Dataset. 

727 

728 Parameters 

729 ---------- 

730 ref : `DatasetRef` 

731 Reference to the required Dataset. 

732 predict : `bool` 

733 If `True`, allow URIs to be returned of datasets that have not 

734 been written. 

735 

736 Returns 

737 ------- 

738 uri : `str` 

739 URI string pointing to the dataset within the datastore. If the 

740 dataset does not exist in the datastore, and if ``predict`` is 

741 `True`, the URI will be a prediction and will include a URI 

742 fragment "#predicted". 

743 If the datastore does not have entities that relate well 

744 to the concept of a URI the returned URI string will be 

745 descriptive. The returned URI is not guaranteed to be obtainable. 

746 

747 Raises 

748 ------ 

749 FileNotFoundError 

750 A URI has been requested for a dataset that does not exist and 

751 guessing is not allowed. 

752 

753 Notes 

754 ----- 

755 When a predicted URI is requested an attempt will be made to form 

756 a reasonable URI based on file templates and the expected formatter. 

757 """ 

758 # if this has never been written then we have to guess 

759 if not self.exists(ref): 

760 if not predict: 

761 raise FileNotFoundError("Dataset {} not in this datastore".format(ref)) 

762 

763 template = self.templates.getTemplate(ref) 

764 location = self.locationFactory.fromPath(template.format(ref)) 

765 storageClass = ref.datasetType.storageClass 

766 formatter = self.formatterFactory.getFormatter(ref, FileDescriptor(location, 

767 storageClass=storageClass)) 

768 # Try to use the extension attribute but ignore problems if the 

769 # formatter does not define one. 

770 try: 

771 location = formatter.makeUpdatedLocation(location) 

772 except Exception: 

773 # Use the default extension 

774 pass 

775 

776 # Add a URI fragment to indicate this is a guess 

777 return location.uri + "#predicted" 

778 

779 # If this is a ref that we have written we can get the path. 

780 # Get file metadata and internal metadata 

781 storedFileInfo = self.getStoredItemInfo(ref) 

782 

783 # Use the path to determine the location 

784 location = self.locationFactory.fromPath(storedFileInfo.path) 

785 

786 return location.uri 

787 

788 def get(self, ref, parameters=None): 

789 """Load an InMemoryDataset from the store. 

790 

791 Parameters 

792 ---------- 

793 ref : `DatasetRef` 

794 Reference to the required Dataset. 

795 parameters : `dict` 

796 `StorageClass`-specific parameters that specify, for example, 

797 a slice of the dataset to be loaded. 

798 

799 Returns 

800 ------- 

801 inMemoryDataset : `object` 

802 Requested dataset or slice thereof as an InMemoryDataset. 

803 

804 Raises 

805 ------ 

806 FileNotFoundError 

807 Requested dataset can not be retrieved. 

808 TypeError 

809 Return value from formatter has unexpected type. 

810 ValueError 

811 Formatter failed to process the dataset. 

812 """ 

813 allGetInfo = self._prepare_for_get(ref, parameters) 

814 refComponent = ref.datasetType.component() 

815 

816 if len(allGetInfo) > 1 and not refComponent: 

817 # This was a disassembled dataset spread over multiple files 

818 # and we need to put them all back together again. 

819 # Read into memory and then assemble 

820 usedParams = set() 

821 components = {} 

822 for getInfo in allGetInfo: 

823 # assemblerParams are parameters not understood by the 

824 # associated formatter. 

825 usedParams.update(set(getInfo.assemblerParams)) 

826 

827 component = getInfo.component 

828 # We do not want the formatter to think it's reading 

829 # a component though because it is really reading a 

830 # standalone dataset -- always tell reader it is not a 

831 # component. 

832 components[component] = self._read_artifact_into_memory(getInfo, ref, isComponent=False) 

833 

834 inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

835 

836 # Any unused parameters will have to be passed to the assembler 

837 if parameters: 

838 unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

839 else: 

840 unusedParams = {} 

841 

842 # Process parameters 

843 return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

844 parameters=unusedParams) 

845 

846 else: 

847 # Single file request or component from that composite file 

848 allComponents = {i.component: i for i in allGetInfo} 

849 for lookup in (refComponent, None): 849 ↛ 854line 849 didn't jump to line 854, because the loop on line 849 didn't complete

850 if lookup in allComponents: 850 ↛ 849line 850 didn't jump to line 849, because the condition on line 850 was never false

851 getInfo = allComponents[lookup] 

852 break 

853 else: 

854 raise FileNotFoundError(f"Component {refComponent} not found " 

855 f"for ref {ref} in datastore {self.name}") 

856 

857 return self._read_artifact_into_memory(getInfo, ref, isComponent=getInfo.component is not None) 

858 

859 @transactional 

860 def put(self, inMemoryDataset, ref): 

861 """Write a InMemoryDataset with a given `DatasetRef` to the store. 

862 

863 Parameters 

864 ---------- 

865 inMemoryDataset : `object` 

866 The dataset to store. 

867 ref : `DatasetRef` 

868 Reference to the associated Dataset. 

869 

870 Raises 

871 ------ 

872 TypeError 

873 Supplied object and storage class are inconsistent. 

874 DatasetTypeNotSupportedError 

875 The associated `DatasetType` is not handled by this datastore. 

876 

877 Notes 

878 ----- 

879 If the datastore is configured to reject certain dataset types it 

880 is possible that the put will fail and raise a 

881 `DatasetTypeNotSupportedError`. The main use case for this is to 

882 allow `ChainedDatastore` to put to multiple datastores without 

883 requiring that every datastore accepts the dataset. 

884 """ 

885 

886 doDisassembly = self.composites.shouldBeDisassembled(ref) 

887 # doDisassembly = True 

888 

889 artifacts = [] 

890 if doDisassembly: 

891 components = ref.datasetType.storageClass.assembler().disassemble(inMemoryDataset) 

892 for component, componentInfo in components.items(): 

893 compTypeName = ref.datasetType.componentTypeName(component) 

894 # Don't recurse because we want to take advantage of 

895 # bulk insert -- need a new DatasetRef that refers to the 

896 # same dataset_id but has the component DatasetType 

897 # DatasetType does not refer to the types of components 

898 # So we construct one ourselves. 

899 compType = DatasetType(compTypeName, dimensions=ref.datasetType.dimensions, 

900 storageClass=componentInfo.storageClass) 

901 compRef = DatasetRef(compType, ref.dataId, id=ref.id, run=ref.run, conform=False) 

902 storedInfo = self._write_in_memory_to_artifact(componentInfo.component, compRef) 

903 artifacts.append((compRef, storedInfo)) 

904 else: 

905 # Write the entire thing out 

906 storedInfo = self._write_in_memory_to_artifact(inMemoryDataset, ref) 

907 artifacts.append((ref, storedInfo)) 

908 

909 self._register_datasets(artifacts) 

910 

911 @transactional 

912 def trash(self, ref, ignore_errors=True): 

913 """Indicate to the datastore that a dataset can be removed. 

914 

915 Parameters 

916 ---------- 

917 ref : `DatasetRef` 

918 Reference to the required Dataset. 

919 ignore_errors : `bool` 

920 If `True` return without error even if something went wrong. 

921 Problems could occur if another process is simultaneously trying 

922 to delete. 

923 

924 Raises 

925 ------ 

926 FileNotFoundError 

927 Attempt to remove a dataset that does not exist. 

928 """ 

929 # Get file metadata and internal metadata 

930 log.debug("Trashing %s in datastore %s", ref, self.name) 

931 

932 fileLocations = self._get_dataset_locations_info(ref) 

933 

934 if not fileLocations: 

935 err_msg = f"Requested dataset to trash ({ref}) is not known to datastore {self.name}" 

936 if ignore_errors: 

937 log.warning(err_msg) 

938 return 

939 else: 

940 raise FileNotFoundError(err_msg) 

941 

942 for location, storedFileInfo in fileLocations: 

943 if not self._artifact_exists(location): 943 ↛ 944line 943 didn't jump to line 944, because the condition on line 943 was never true

944 err_msg = f"Dataset is known to datastore {self.name} but " \ 

945 f"associated artifact ({location.uri}) is missing" 

946 if ignore_errors: 

947 log.warning(err_msg) 

948 return 

949 else: 

950 raise FileNotFoundError(err_msg) 

951 

952 # Mark dataset as trashed 

953 try: 

954 self._move_to_trash_in_registry(ref) 

955 except Exception as e: 

956 if ignore_errors: 

957 log.warning(f"Attempted to mark dataset ({ref}) to be trashed in datastore {self.name} " 

958 f"but encountered an error: {e}") 

959 pass 

960 else: 

961 raise 

962 

963 @transactional 

964 def emptyTrash(self, ignore_errors=True): 

965 """Remove all datasets from the trash. 

966 

967 Parameters 

968 ---------- 

969 ignore_errors : `bool` 

970 If `True` return without error even if something went wrong. 

971 Problems could occur if another process is simultaneously trying 

972 to delete. 

973 """ 

974 log.debug("Emptying trash in datastore %s", self.name) 

975 trashed = self.registry.getTrashedDatasets(self.name) 

976 

977 for ref in trashed: 

978 fileLocations = self._get_dataset_locations_info(ref) 

979 

980 for location, _ in fileLocations: 

981 

982 if location is None: 982 ↛ 983line 982 didn't jump to line 983, because the condition on line 982 was never true

983 err_msg = f"Requested dataset ({ref}) does not exist in datastore {self.name}" 

984 if ignore_errors: 

985 log.warning(err_msg) 

986 continue 

987 else: 

988 raise FileNotFoundError(err_msg) 

989 

990 if not self._artifact_exists(location): 990 ↛ 991line 990 didn't jump to line 991, because the condition on line 990 was never true

991 err_msg = f"Dataset {location.uri} no longer present in datastore {self.name}" 

992 if ignore_errors: 

993 log.warning(err_msg) 

994 continue 

995 else: 

996 raise FileNotFoundError(err_msg) 

997 

998 # Can only delete the artifact if there are no references 

999 # to the file from untrashed dataset refs. 

1000 if self._can_remove_dataset_artifact(ref, location): 

1001 # Point of no return for this artifact 

1002 log.debug("Removing artifact %s from datastore %s", location.uri, self.name) 

1003 try: 

1004 self._delete_artifact(location) 

1005 except Exception as e: 

1006 if ignore_errors: 

1007 log.critical("Encountered error removing artifact %s from datastore %s: %s", 

1008 location.uri, self.name, e) 

1009 else: 

1010 raise 

1011 

1012 # Now must remove the entry from the internal registry even if 

1013 # the artifact removal failed and was ignored, 

1014 # otherwise the removal check above will never be true 

1015 try: 

1016 # There may be multiple rows associated with this ref 

1017 # depending on disassembly 

1018 self.removeStoredItemInfo(ref) 

1019 except Exception as e: 

1020 if ignore_errors: 

1021 log.warning("Error removing dataset %s (%s) from internal registry of %s: %s", 

1022 ref.id, location.uri, self.name, e) 

1023 continue 

1024 else: 

1025 raise 

1026 

1027 # Inform registry that we have removed items from datastore 

1028 # This should work even if another process is clearing out those rows 

1029 self.registry.emptyDatasetLocationsTrash(self.name, trashed) 

1030 

1031 def validateConfiguration(self, entities, logFailures=False): 

1032 """Validate some of the configuration for this datastore. 

1033 

1034 Parameters 

1035 ---------- 

1036 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1037 Entities to test against this configuration. Can be differing 

1038 types. 

1039 logFailures : `bool`, optional 

1040 If `True`, output a log message for every validation error 

1041 detected. 

1042 

1043 Raises 

1044 ------ 

1045 DatastoreValidationError 

1046 Raised if there is a validation problem with a configuration. 

1047 All the problems are reported in a single exception. 

1048 

1049 Notes 

1050 ----- 

1051 This method checks that all the supplied entities have valid file 

1052 templates and also have formatters defined. 

1053 """ 

1054 

1055 templateFailed = None 

1056 try: 

1057 self.templates.validateTemplates(entities, logFailures=logFailures) 

1058 except FileTemplateValidationError as e: 

1059 templateFailed = str(e) 

1060 

1061 formatterFailed = [] 

1062 for entity in entities: 

1063 try: 

1064 self.formatterFactory.getFormatterClass(entity) 

1065 except KeyError as e: 

1066 formatterFailed.append(str(e)) 

1067 if logFailures: 1067 ↛ 1062line 1067 didn't jump to line 1062, because the condition on line 1067 was never false

1068 log.fatal("Formatter failure: %s", e) 

1069 

1070 if templateFailed or formatterFailed: 

1071 messages = [] 

1072 if templateFailed: 1072 ↛ 1073line 1072 didn't jump to line 1073, because the condition on line 1072 was never true

1073 messages.append(templateFailed) 

1074 if formatterFailed: 1074 ↛ 1076line 1074 didn't jump to line 1076, because the condition on line 1074 was never false

1075 messages.append(",".join(formatterFailed)) 

1076 msg = ";\n".join(messages) 

1077 raise DatastoreValidationError(msg) 

1078 

1079 def getLookupKeys(self): 

1080 # Docstring is inherited from base class 

1081 return self.templates.getLookupKeys() | self.formatterFactory.getLookupKeys() | \ 

1082 self.constraints.getLookupKeys() 

1083 

1084 def validateKey(self, lookupKey, entity): 

1085 # Docstring is inherited from base class 

1086 # The key can be valid in either formatters or templates so we can 

1087 # only check the template if it exists 

1088 if lookupKey in self.templates: 

1089 try: 

1090 self.templates[lookupKey].validateTemplate(entity) 

1091 except FileTemplateValidationError as e: 

1092 raise DatastoreValidationError(e) from e