Coverage for python/lsst/daf/butler/core/datastore.py: 42%

246 statements  

coverage.py v6.5.0, created at 2022-11-12 02:19 -0800

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for generic data stores.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs") 

27 

28import contextlib 

29import dataclasses 

30import logging 

31from abc import ABCMeta, abstractmethod 

32from collections import abc, defaultdict 

33from typing import ( 

34 TYPE_CHECKING, 

35 Any, 

36 Callable, 

37 ClassVar, 

38 Dict, 

39 Iterable, 

40 Iterator, 

41 List, 

42 Mapping, 

43 Optional, 

44 Set, 

45 Tuple, 

46 Type, 

47 Union, 

48) 

49 

50from lsst.utils import doImportType 

51 

52from .config import Config, ConfigSubset 

53from .constraints import Constraints 

54from .exceptions import DatasetTypeNotSupportedError, ValidationError 

55from .fileDataset import FileDataset 

56from .storageClass import StorageClassFactory 

57 

58if TYPE_CHECKING: [coverage: 58 ↛ 59, line 58 didn't jump to line 59 because the condition on line 58 was never true]

59 from lsst.resources import ResourcePath, ResourcePathExpression 

60 

61 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

62 from .configSupport import LookupKey 

63 from .datasets import DatasetRef, DatasetType 

64 from .datastoreRecordData import DatastoreRecordData 

65 from .storageClass import StorageClass 

66 

67 

68class DatastoreConfig(ConfigSubset): 

69 """Configuration for Datastores.""" 

70 

71 component = "datastore" 

72 requiredKeys = ("cls",) 

73 defaultConfigFile = "datastore.yaml" 

74 

75 

76class DatastoreValidationError(ValidationError): 

77 """There is a problem with the Datastore configuration.""" 

78 

79 pass 

80 

81 

82@dataclasses.dataclass(frozen=True) 

83class Event: 

84 __slots__ = {"name", "undoFunc", "args", "kwargs"} 

85 name: str 

86 undoFunc: Callable 

87 args: tuple 

88 kwargs: dict 

89 

90 

91class IngestPrepData: 

92 """A helper base class for `Datastore` ingest implementations. 

93 

94 Datastore implementations will generally need a custom implementation of 

95 this class. 

96 

97 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct 

98 import. 

99 

100 Parameters 

101 ---------- 

102 refs : iterable of `DatasetRef` 

103 References for the datasets that can be ingested by this datastore. 

104 """ 

105 

106 def __init__(self, refs: Iterable[DatasetRef]): 

107 self.refs = {ref.id: ref for ref in refs} 

108 

109 

110class DatastoreTransaction: 

111 """Keeps a log of `Datastore` activity and allow rollback. 

112 

113 Parameters 

114 ---------- 

115 parent : `DatastoreTransaction`, optional 

116 The parent transaction (if any). 

117 """ 

118 

119 Event: ClassVar[Type] = Event 

120 

121 parent: Optional[DatastoreTransaction] 

122 """The parent transaction. (`DatastoreTransaction`, optional)""" 

123 

124 def __init__(self, parent: Optional[DatastoreTransaction] = None): 

125 self.parent = parent 

126 self._log: List[Event] = [] 

127 

128 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None: 

129 """Register event with undo function. 

130 

131 Parameters 

132 ---------- 

133 name : `str` 

134 Name of the event. 

135 undoFunc : func 

136 Function to undo this event. 

137 args : `tuple` 

138 Positional arguments to `undoFunc`. 

139 **kwargs 

140 Keyword arguments to `undoFunc`. 

141 """ 

142 self._log.append(self.Event(name, undoFunc, args, kwargs)) 

143 
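A minimal usage sketch for ``registerUndo`` (the transaction, file path, and cleanup callable are illustrative): the registered function is only invoked if the transaction is later rolled back.

import os

txn = DatastoreTransaction()
# Record how to undo a hypothetical temporary-file write; rollback() would
# call os.remove("/tmp/example.dat"), while commit() simply keeps the event
# or forwards it to a parent transaction.
txn.registerUndo("write temp file", os.remove, "/tmp/example.dat")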

144 @contextlib.contextmanager 

145 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]: 

146 """Register undo function if nested operation succeeds. 

147 

148 Calls `registerUndo`. 

149 

150 This can be used to wrap individual undo-able statements within a 

151 DatastoreTransaction block. Multiple statements that can fail 

152 separately should not be part of the same `undoWith` block. 

153 

154 All arguments are forwarded directly to `registerUndo`. 

155 """ 

156 try: 

157 yield None 

158 except BaseException: 

159 raise 

160 else: 

161 self.registerUndo(name, undoFunc, *args, **kwargs) 

162 
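A sketch of ``undoWith`` guarding a single undo-able statement (the paths are illustrative): the undo action is registered only if the wrapped block completes without raising.

import os
import shutil

txn = DatastoreTransaction()
with txn.undoWith("copy file", os.remove, "/tmp/copy.dat"):
    # If the copy raises, no undo action is registered for it.
    shutil.copy("/tmp/src.dat", "/tmp/copy.dat")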

163 def rollback(self) -> None: 

164 """Roll back all events in this transaction.""" 

165 log = logging.getLogger(__name__) 

166 while self._log: 

167 ev = self._log.pop() 

168 try: 

169 log.debug( 

170 "Rolling back transaction: %s: %s(%s,%s)", 

171 ev.name, 

172 ev.undoFunc, 

173 ",".join(str(a) for a in ev.args), 

174 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()), 

175 ) 

176 except Exception: 

177 # In case we had a problem in stringification of arguments 

178 log.warning("Rolling back transaction: %s", ev.name) 

179 try: 

180 ev.undoFunc(*ev.args, **ev.kwargs) 

181 except BaseException as e: 

182 # Deliberately swallow error that may occur in unrolling 

183 log.warning("Exception: %s caught while unrolling: %s", e, ev.name) 

184 pass 

185 

186 def commit(self) -> None: 

187 """Commit this transaction.""" 

188 if self.parent is None: 

189 # Just forget about the events, they have already happened. 

190 return 

191 else: 

192 # We may still want to roll back events from this transaction as part of 

193 # the parent. 

194 self.parent._log.extend(self._log) 

195 

196 

197@dataclasses.dataclass 

198class DatasetRefURIs(abc.Sequence): 

199 """Represents the primary and component ResourcePath(s) associated with a 

200 DatasetRef. 

201 

202 This is used in places where its members used to be represented as a tuple 

203 `(primaryURI, componentURIs)`. To maintain backward compatibility this 

204 inherits from Sequence and so instances can be treated as a two-item 

205 tuple. 

206 """ 

207 

208 def __init__( 

209 self, 

210 primaryURI: Optional[ResourcePath] = None, 

211 componentURIs: Optional[Dict[str, ResourcePath]] = None, 

212 ): 

213 

214 self.primaryURI = primaryURI 

215 """The URI to the primary artifact associated with this dataset. If the 

216 dataset was disassembled within the datastore this may be `None`. 

217 """ 

218 

219 self.componentURIs = componentURIs or {} 

220 """The URIs to any components associated with the dataset artifact 

221 indexed by component name. This can be empty if there are no 

222 components. 

223 """ 

224 

225 def __getitem__(self, index: Any) -> Any: 

226 """Get primaryURI and componentURIs by index. 

227 

228 Provides support for tuple-like access. 

229 """ 

230 if index == 0: 

231 return self.primaryURI 

232 elif index == 1: 

233 return self.componentURIs 

234 raise IndexError("list index out of range") 

235 

236 def __len__(self) -> int: 

237 """Get the number of data members. 

238 

239 Provides support for tuple-like access. 

240 """ 

241 return 2 

242 

243 def __repr__(self) -> str: 

244 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})" 

245 
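A sketch of the tuple-like behaviour described above (the component URI is illustrative): a ``DatasetRefURIs`` unpacks like the old ``(primaryURI, componentURIs)`` tuple while still exposing named attributes.

from lsst.resources import ResourcePath

uris = DatasetRefURIs(None, {"wcs": ResourcePath("file:///tmp/example_wcs.fits")})
primaryURI, componentURIs = uris  # unpacks as a two-item sequence
assert uris[0] is uris.primaryURI
assert len(uris) == 2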

246 

247class Datastore(metaclass=ABCMeta): 

248 """Datastore interface. 

249 

250 Parameters 

251 ---------- 

252 config : `DatastoreConfig` or `str` 

253 Load configuration either from an existing config instance or by 

254 referring to a configuration file. 

255 bridgeManager : `DatastoreRegistryBridgeManager` 

256 Object that manages the interface between `Registry` and datastores. 

257 butlerRoot : `str`, optional 

258 New datastore root to use to override the configuration value. 

259 """ 

260 

261 defaultConfigFile: ClassVar[Optional[str]] = None 

262 """Path to configuration defaults. Accessed within the ``config`` resource 

263 or relative to a search path. Can be None if no defaults specified. 

264 """ 

265 

266 containerKey: ClassVar[Optional[str]] = None 

267 """Name of the key containing a list of subconfigurations that also 

268 need to be merged with defaults and will likely use different Python 

269 datastore classes (but all using DatastoreConfig). Assumed to be a 

270 list of configurations that can be represented in a DatastoreConfig 

271 and containing a "cls" definition. None indicates that no containers 

272 are expected in this Datastore.""" 

273 

274 isEphemeral: bool = False 

275 """Indicate whether this Datastore is ephemeral or not. An ephemeral 

276 datastore is one where the contents of the datastore will not exist 

277 across process restarts. This value can change per-instance.""" 

278 

279 config: DatastoreConfig 

280 """Configuration used to create Datastore.""" 

281 

282 name: str 

283 """Label associated with this Datastore.""" 

284 

285 storageClassFactory: StorageClassFactory 

286 """Factory for creating storage class instances from name.""" 

287 

288 constraints: Constraints 

289 """Constraints to apply when putting datasets into the datastore.""" 

290 

291 # MyPy does not like for this to be annotated as any kind of type, because 

292 # it can't do static checking on type variables that can change at runtime. 

293 IngestPrepData: ClassVar[Any] = IngestPrepData 

294 """Helper base class for ingest implementations. 

295 """ 

296 

297 @classmethod 

298 @abstractmethod 

299 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

300 """Set filesystem-dependent config options for this datastore. 

301 

302 The options will be appropriate for a new empty repository with the 

303 given root. 

304 

305 Parameters 

306 ---------- 

307 root : `str` 

308 Filesystem path to the root of the data repository. 

309 config : `Config` 

310 A `Config` to update. Only the subset understood by 

311 this component will be updated. Will not expand 

312 defaults. 

313 full : `Config` 

314 A complete config with all defaults expanded that can be 

315 converted to a `DatastoreConfig`. Read-only and will not be 

316 modified by this method. 

317 Repository-specific options that should not be obtained 

318 from defaults when Butler instances are constructed 

319 should be copied from ``full`` to ``config``. 

320 overwrite : `bool`, optional 

321 If `False`, do not modify a value in ``config`` if the value 

322 already exists. Default is always to overwrite with the provided 

323 ``root``. 

324 

325 Notes 

326 ----- 

327 If a keyword is explicitly defined in the supplied ``config`` it 

328 will not be overridden by this method if ``overwrite`` is `False`. 

329 This allows explicit values set in external configs to be retained. 

330 """ 

331 raise NotImplementedError() 

332 

333 @staticmethod 

334 def fromConfig( 

335 config: Config, 

336 bridgeManager: DatastoreRegistryBridgeManager, 

337 butlerRoot: Optional[ResourcePathExpression] = None, 

338 ) -> "Datastore": 

339 """Create datastore from type specified in config file. 

340 

341 Parameters 

342 ---------- 

343 config : `Config` 

344 Configuration instance. 

345 bridgeManager : `DatastoreRegistryBridgeManager` 

346 Object that manages the interface between `Registry` and 

347 datastores. 

348 butlerRoot : `str`, optional 

349 Butler root directory. 

350 """ 

351 cls = doImportType(config["datastore", "cls"]) 

352 if not issubclass(cls, Datastore): 

353 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore") 

354 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot) 

355 
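A construction sketch for ``fromConfig``; the datastore class path and the ``bridge_manager`` variable are illustrative, but the ``datastore.cls`` key is the one read and checked above.

config = Config(
    {"datastore": {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"}}
)
# bridge_manager would normally be obtained from the Registry that owns
# this datastore; it is assumed to exist here.
datastore = Datastore.fromConfig(config, bridgeManager=bridge_manager)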

356 def __init__( 

357 self, 

358 config: Union[Config, str], 

359 bridgeManager: DatastoreRegistryBridgeManager, 

360 butlerRoot: Optional[ResourcePathExpression] = None, 

361 ): 

362 self.config = DatastoreConfig(config) 

363 self.name = "ABCDataStore" 

364 self._transaction: Optional[DatastoreTransaction] = None 

365 

366 # All Datastores need storage classes and constraints 

367 self.storageClassFactory = StorageClassFactory() 

368 

369 # And read the constraints list 

370 constraintsConfig = self.config.get("constraints") 

371 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe) 

372 

373 def __str__(self) -> str: 

374 return self.name 

375 

376 def __repr__(self) -> str: 

377 return self.name 

378 

379 @property 

380 def names(self) -> Tuple[str, ...]: 

381 """Names associated with this datastore returned as a list. 

382 

383 Can be different to ``name`` for a chaining datastore. 

384 """ 

385 # Default implementation returns solely the name itself 

386 return (self.name,) 

387 

388 @contextlib.contextmanager 

389 def transaction(self) -> Iterator[DatastoreTransaction]: 

390 """Context manager supporting `Datastore` transactions. 

391 

392 Transactions can be nested, and are to be used in combination with 

393 `Registry.transaction`. 

394 """ 

395 self._transaction = DatastoreTransaction(self._transaction) 

396 try: 

397 yield self._transaction 

398 except BaseException: 

399 self._transaction.rollback() 

400 raise 

401 else: 

402 self._transaction.commit() 

403 self._transaction = self._transaction.parent 

404 
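A usage sketch of the ``transaction`` context manager (the datasets and refs are illustrative): an exception inside the block rolls back any undo events registered during it before propagating.

with datastore.transaction():
    datastore.put(in_memory_dataset, ref)
    # Transactions nest; committing the inner one forwards its events to
    # the outer transaction.
    with datastore.transaction():
        datastore.put(other_dataset, other_ref)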

405 @abstractmethod 

406 def knows(self, ref: DatasetRef) -> bool: 

407 """Check if the dataset is known to the datastore. 

408 

409 Does not check for existence of any artifact. 

410 

411 Parameters 

412 ---------- 

413 ref : `DatasetRef` 

414 Reference to the required dataset. 

415 

416 Returns 

417 ------- 

418 exists : `bool` 

419 `True` if the dataset is known to the datastore. 

420 """ 

421 raise NotImplementedError() 

422 

423 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

424 """Check which of the given datasets are known to this datastore. 

425 

426 This is like ``mexists()`` but does not check that the files exist. 

427 

428 Parameters 

429 ---------- 

430 refs : iterable of `DatasetRef` 

431 The datasets to check. 

432 

433 Returns 

434 ------- 

435 exists : `dict`[`DatasetRef`, `bool`] 

436 Mapping of dataset to boolean indicating whether the dataset 

437 is known to the datastore. 

438 """ 

439 # Non-optimized default calls knows() repeatedly. 

440 return {ref: self.knows(ref) for ref in refs} 

441 

442 def mexists( 

443 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

444 ) -> Dict[DatasetRef, bool]: 

445 """Check the existence of multiple datasets at once. 

446 

447 Parameters 

448 ---------- 

449 refs : iterable of `DatasetRef` 

450 The datasets to be checked. 

451 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

452 Optional mapping of datastore artifact to existence. Updated by 

453 this method with details of all artifacts tested. Can be `None` 

454 if the caller is not interested. 

455 

456 Returns 

457 ------- 

458 existence : `dict` [`DatasetRef`, `bool`] 

459 Mapping from dataset to boolean indicating existence. 

460 """ 

461 existence: Dict[DatasetRef, bool] = {} 

462 # Non-optimized default. 

463 for ref in refs: 

464 existence[ref] = self.exists(ref) 

465 return existence 

466 
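A bulk existence-check sketch (``refs`` is illustrative); subclasses may provide a vectorised override, but the returned mapping has the same shape as this default implementation.

existence = datastore.mexists(refs)
missing = [ref for ref, exists in existence.items() if not exists]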

467 @abstractmethod 

468 def exists(self, datasetRef: DatasetRef) -> bool: 

469 """Check if the dataset exists in the datastore. 

470 

471 Parameters 

472 ---------- 

473 datasetRef : `DatasetRef` 

474 Reference to the required dataset. 

475 

476 Returns 

477 ------- 

478 exists : `bool` 

479 `True` if the entity exists in the `Datastore`. 

480 """ 

481 raise NotImplementedError("Must be implemented by subclass") 

482 

483 @abstractmethod 

484 def get( 

485 self, 

486 datasetRef: DatasetRef, 

487 parameters: Mapping[str, Any] | None = None, 

488 storageClass: Optional[Union[StorageClass, str]] = None, 

489 ) -> Any: 

490 """Load an `InMemoryDataset` from the store. 

491 

492 Parameters 

493 ---------- 

494 datasetRef : `DatasetRef` 

495 Reference to the required Dataset. 

496 parameters : `dict` 

497 `StorageClass`-specific parameters that specify a slice of the 

498 Dataset to be loaded. 

499 storageClass : `StorageClass` or `str`, optional 

500 The storage class to be used to override the Python type 

501 returned by this method. By default the returned type matches 

502 the dataset type definition for this dataset. Specifying a 

503 read `StorageClass` can force a different type to be returned. 

504 This type must be compatible with the original type. 

505 

506 Returns 

507 ------- 

508 inMemoryDataset : `object` 

509 Requested Dataset or slice thereof as an InMemoryDataset. 

510 """ 

511 raise NotImplementedError("Must be implemented by subclass") 

512 

513 @abstractmethod 

514 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None: 

515 """Write a `InMemoryDataset` with a given `DatasetRef` to the store. 

516 

517 Parameters 

518 ---------- 

519 inMemoryDataset : `object` 

520 The Dataset to store. 

521 datasetRef : `DatasetRef` 

522 Reference to the associated Dataset. 

523 """ 

524 raise NotImplementedError("Must be implemented by subclass") 

525 

526 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

527 """Allow ingest transfer mode to be defaulted based on datasets. 

528 

529 Parameters 

530 ---------- 

531 datasets : `FileDataset` 

532 Each positional argument is a struct containing information about 

533 a file to be ingested, including its path (either absolute or 

534 relative to the datastore root, if applicable), a complete 

535 `DatasetRef` (with ``dataset_id not None``), and optionally a 

536 formatter class or its fully-qualified string name. If a formatter 

537 is not provided, this method should populate that attribute with 

538 the formatter the datastore would use for `put`. Subclasses are 

539 also permitted to modify the path attribute (typically to put it 

540 in what the datastore considers its standard form). 

541 transfer : `str`, optional 

542 How (and whether) the dataset should be added to the datastore. 

543 See `ingest` for details of transfer modes. 

544 

545 Returns 

546 ------- 

547 newTransfer : `str` 

548 Transfer mode to use. Will be identical to the supplied transfer 

549 mode unless "auto" is used. 

550 """ 

551 if transfer != "auto": 

552 return transfer 

553 raise RuntimeError(f"{transfer} is not allowed without specialization.") 

554 

555 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData: 

556 """Process datasets to identify which ones can be ingested. 

557 

558 Parameters 

559 ---------- 

560 datasets : `FileDataset` 

561 Each positional argument is a struct containing information about 

562 a file to be ingested, including its path (either absolute or 

563 relative to the datastore root, if applicable), a complete 

564 `DatasetRef` (with ``dataset_id not None``), and optionally a 

565 formatter class or its fully-qualified string name. If a formatter 

566 is not provided, this method should populate that attribute with 

567 the formatter the datastore would use for `put`. Subclasses are 

568 also permitted to modify the path attribute (typically to put it 

569 in what the datastore considers its standard form). 

570 transfer : `str`, optional 

571 How (and whether) the dataset should be added to the datastore. 

572 See `ingest` for details of transfer modes. 

573 

574 Returns 

575 ------- 

576 data : `IngestPrepData` 

577 An instance of a subclass of `IngestPrepData`, used to pass 

578 arbitrary data from `_prepIngest` to `_finishIngest`. This should 

579 include only the datasets this datastore can actually ingest; 

580 others should be silently ignored (`Datastore.ingest` will inspect 

581 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if 

582 necessary). 

583 

584 Raises 

585 ------ 

586 NotImplementedError 

587 Raised if the datastore does not support the given transfer mode 

588 (including the case where ingest is not supported at all). 

589 FileNotFoundError 

590 Raised if one of the given files does not exist. 

591 FileExistsError 

592 Raised if transfer is not `None` but the (internal) location the 

593 file would be moved to is already occupied. 

594 

595 Notes 

596 ----- 

597 This method (along with `_finishIngest`) should be implemented by 

598 subclasses to provide ingest support instead of implementing `ingest` 

599 directly. 

600 

601 `_prepIngest` should not modify the data repository or given files in 

602 any way; all changes should be deferred to `_finishIngest`. 

603 

604 When possible, exceptions should be raised in `_prepIngest` instead of 

605 `_finishIngest`. `NotImplementedError` exceptions that indicate that 

606 the transfer mode is not supported must be raised by `_prepIngest` 

607 instead of `_finishIngest`. 

608 """ 

609 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

610 

611 def _finishIngest( 

612 self, prepData: IngestPrepData, *, transfer: Optional[str] = None, record_validation_info: bool = True 

613 ) -> None: 

614 """Complete an ingest operation. 

615 

616 Parameters 

617 ---------- 

618 prepData : `IngestPrepData` 

619 An instance of a subclass of `IngestPrepData`. Guaranteed to be 

620 the direct result of a call to `_prepIngest` on this datastore. 

621 transfer : `str`, optional 

622 How (and whether) the dataset should be added to the datastore. 

623 See `ingest` for details of transfer modes. 

624 record_validation_info : `bool`, optional 

625 If `True`, the default, the datastore can record validation 

626 information associated with the file. If `False` the datastore 

627 will not attempt to track any information such as checksums 

628 or file sizes. This can be useful if such information is tracked 

629 in an external system or if the file is to be compressed in place. 

630 It is up to the datastore whether this parameter is relevant. 

631 

632 Raises 

633 ------ 

634 FileNotFoundError 

635 Raised if one of the given files does not exist. 

636 FileExistsError 

637 Raised if transfer is not `None` but the (internal) location the 

638 file would be moved to is already occupied. 

639 

640 Notes 

641 ----- 

642 This method (along with `_prepIngest`) should be implemented by 

643 subclasses to provide ingest support instead of implementing `ingest` 

644 directly. 

645 """ 

646 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

647 
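A sketch of how a subclass typically plugs into the two-phase ingest protocol described above. The class and its body are purely illustrative; a real datastore would filter unsupported datasets in ``_prepIngest``, move or copy files in ``_finishIngest``, and also implement the remaining abstract methods.

class SketchDatastore(Datastore):
    def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData:
        # Accept every dataset unchanged; no repository state may be
        # modified in this phase.
        return self.IngestPrepData(ref for dataset in datasets for ref in dataset.refs)

    def _finishIngest(
        self, prepData: IngestPrepData, *, transfer: Optional[str] = None, record_validation_info: bool = True
    ) -> None:
        # All repository changes happen here, using the refs prepared above.
        for ref in prepData.refs.values():
            pass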

648 def ingest( 

649 self, *datasets: FileDataset, transfer: Optional[str] = None, record_validation_info: bool = True 

650 ) -> None: 

651 """Ingest one or more files into the datastore. 

652 

653 Parameters 

654 ---------- 

655 datasets : `FileDataset` 

656 Each positional argument is a struct containing information about 

657 a file to be ingested, including its path (either absolute or 

658 relative to the datastore root, if applicable), a complete 

659 `DatasetRef` (with ``dataset_id not None``), and optionally a 

660 formatter class or its fully-qualified string name. If a formatter 

661 is not provided, the one the datastore would use for ``put`` on 

662 that dataset is assumed. 

663 transfer : `str`, optional 

664 How (and whether) the dataset should be added to the datastore. 

665 If `None` (default), the file must already be in a location 

666 appropriate for the datastore (e.g. within its root directory), 

667 and will not be modified. Other choices include "move", "copy", 

668 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

669 special transfer mode that will first try to make a hardlink and 

670 if that fails a symlink will be used instead. "relsymlink" creates 

671 a relative symlink rather than use an absolute path. 

672 Most datastores do not support all transfer modes. 

673 "auto" is a special option that will let the 

674 data store choose the most natural option for itself. 

675 record_validation_info : `bool`, optional 

676 If `True`, the default, the datastore can record validation 

677 information associated with the file. If `False` the datastore 

678 will not attempt to track any information such as checksums 

679 or file sizes. This can be useful if such information is tracked 

680 in an external system or if the file is to be compressed in place. 

681 It is up to the datastore whether this parameter is relevant. 

682 

683 Raises 

684 ------ 

685 NotImplementedError 

686 Raised if the datastore does not support the given transfer mode 

687 (including the case where ingest is not supported at all). 

688 DatasetTypeNotSupportedError 

689 Raised if one or more files to be ingested have a dataset type that 

690 is not supported by the datastore. 

691 FileNotFoundError 

692 Raised if one of the given files does not exist. 

693 FileExistsError 

694 Raised if transfer is not `None` but the (internal) location the 

695 file would be moved to is already occupied. 

696 

697 Notes 

698 ----- 

699 Subclasses should implement `_prepIngest` and `_finishIngest` instead 

700 of implementing `ingest` directly. Datastores that hold and 

701 delegate to child datastores may want to call those methods as well. 

702 

703 Subclasses are encouraged to document their supported transfer modes 

704 in their class documentation. 

705 """ 

706 # Allow a datastore to select a default transfer mode 

707 transfer = self._overrideTransferMode(*datasets, transfer=transfer) 

708 prepData = self._prepIngest(*datasets, transfer=transfer) 

709 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs} 

710 if None in refs: 

711 # Find the file for the error message. There may be multiple 

712 # bad refs so look for all of them. 

713 unresolved_paths = {} 

714 for dataset in datasets: 

715 unresolved = [] 

716 for ref in dataset.refs: 

717 if ref.id is None: 

718 unresolved.append(ref) 

719 if unresolved: 

720 unresolved_paths[dataset.path] = unresolved 

721 raise RuntimeError( 

722 "Attempt to ingest unresolved DatasetRef from: " 

723 + ",".join(f"{p}: ({[str(r) for r in ref]})" for p, ref in unresolved_paths.items()) 

724 ) 

725 if refs.keys() != prepData.refs.keys(): 

726 unsupported = refs.keys() - prepData.refs.keys() 

727 # Group unsupported refs by DatasetType for an informative 

728 # but still concise error message. 

729 byDatasetType = defaultdict(list) 

730 for datasetId in unsupported: 

731 ref = refs[datasetId] 

732 byDatasetType[ref.datasetType].append(ref) 

733 raise DatasetTypeNotSupportedError( 

734 "DatasetType(s) not supported in ingest: " 

735 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items()) 

736 ) 

737 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info) 

738 
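A usage sketch of ``ingest`` (the path and ``ref`` are illustrative, and the ref must already be resolved); the call delegates to ``_prepIngest`` and ``_finishIngest`` as described above.

dataset = FileDataset(path="/data/raw/exposure_0001.fits", refs=[ref])
datastore.ingest(dataset, transfer="copy")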

739 def transfer_from( 

740 self, 

741 source_datastore: Datastore, 

742 refs: Iterable[DatasetRef], 

743 local_refs: Optional[Iterable[DatasetRef]] = None, 

744 transfer: str = "auto", 

745 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

746 ) -> None: 

747 """Transfer dataset artifacts from another datastore to this one. 

748 

749 Parameters 

750 ---------- 

751 source_datastore : `Datastore` 

752 The datastore from which to transfer artifacts. That datastore 

753 must be compatible with this datastore receiving the artifacts. 

754 refs : iterable of `DatasetRef` 

755 The datasets to transfer from the source datastore. 

756 local_refs : iterable of `DatasetRef`, optional 

757 The dataset refs associated with this datastore's registry. 

758 Can be `None` if the source and target datastores are using 

759 UUIDs. 

760 transfer : `str`, optional 

761 How (and whether) the dataset should be added to the datastore. 

762 Choices include "move", "copy", 

763 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

764 special transfer mode that will first try to make a hardlink and 

765 if that fails a symlink will be used instead. "relsymlink" creates 

766 a relative symlink rather than use an absolute path. 

767 Most datastores do not support all transfer modes. 

768 "auto" (the default) is a special option that will let the 

769 data store choose the most natural option for itself. 

770 If the source location and transfer location are identical the 

771 transfer mode will be ignored. 

772 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

773 Optional mapping of datastore artifact to existence. Updated by 

774 this method with details of all artifacts tested. Can be `None` 

775 if the caller is not interested. 

776 

777 Raises 

778 ------ 

779 TypeError 

780 Raised if the two datastores are not compatible. 

781 """ 

782 if type(self) is not type(source_datastore): 

783 raise TypeError( 

784 f"Datastore mismatch between this datastore ({type(self)}) and the " 

785 f"source datastore ({type(source_datastore)})." 

786 ) 

787 

788 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.") 

789 

790 def getManyURIs( 

791 self, 

792 refs: Iterable[DatasetRef], 

793 predict: bool = False, 

794 allow_missing: bool = False, 

795 ) -> Dict[DatasetRef, DatasetRefURIs]: 

796 """Return URIs associated with many datasets. 

797 

798 Parameters 

799 ---------- 

800 refs : iterable of `DatasetIdRef` 

801 References to the required datasets. 

802 predict : `bool`, optional 

803 If the datastore does not know about a dataset, should it 

804 return a predicted URI or not? 

805 allow_missing : `bool` 

806 If `False`, and `predict` is `False`, will raise if a `DatasetRef` 

807 does not exist. 

808 

809 Returns 

810 ------- 

811 URIs : `dict` [`DatasetRef`, `DatasetRefURIs`] 

812 A dict of primary and component URIs, indexed by the passed-in 

813 refs. 

814 

815 Raises 

816 ------ 

817 FileNotFoundError 

818 A URI has been requested for a dataset that does not exist and 

819 guessing is not allowed. 

820 

821 Notes 

822 ----- 

823 In file-based datastores, `getManyURIs` does not check that the file 

824 is really there; it assumes that if the datastore is aware of the 

825 file then it actually exists. 

826 """ 

827 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

828 missing_refs = [] 

829 for ref in refs: 

830 try: 

831 uris[ref] = self.getURIs(ref, predict=predict) 

832 except FileNotFoundError: 

833 missing_refs.append(ref) 

834 if missing_refs and not allow_missing: 

835 raise FileNotFoundError( 

836 "Missing {} refs from datastore out of {} and predict=False.".format( 

837 num_missing := len(missing_refs), num_missing + len(uris) 

838 ) 

839 ) 

840 return uris 

841 
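A sketch of retrieving URIs for several refs at once (``refs`` is illustrative), tolerating datasets the datastore does not know about:

uris = datastore.getManyURIs(refs, predict=False, allow_missing=True)
for ref, dataset_uris in uris.items():
    print(ref, dataset_uris.primaryURI, dict(dataset_uris.componentURIs))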

842 @abstractmethod 

843 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

844 """Return URIs associated with dataset. 

845 

846 Parameters 

847 ---------- 

848 datasetRef : `DatasetRef` 

849 Reference to the required dataset. 

850 predict : `bool`, optional 

851 If the datastore does not know about the dataset, should it 

852 return a predicted URI or not? 

853 

854 Returns 

855 ------- 

856 uris : `DatasetRefURIs` 

857 The URI to the primary artifact associated with this dataset (if 

858 the dataset was disassembled within the datastore this may be 

859 `None`), and the URIs to any components associated with the dataset 

860 artifact. (can be empty if there are no components). 

861 """ 

862 raise NotImplementedError() 

863 

864 @abstractmethod 

865 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath: 

866 """URI to the Dataset. 

867 

868 Parameters 

869 ---------- 

870 datasetRef : `DatasetRef` 

871 Reference to the required Dataset. 

872 predict : `bool` 

873 If `True` attempt to predict the URI for a dataset if it does 

874 not exist in datastore. 

875 

876 Returns 

877 ------- 

878 uri : `lsst.resources.ResourcePath` 

879 URI pointing to the Dataset within the datastore. If the 

880 Dataset does not exist in the datastore, the URI may be a guess. 

881 If the datastore does not have entities that relate well 

882 to the concept of a URI the returned URI string will be 

883 descriptive. The returned URI is not guaranteed to be obtainable. 

884 

885 Raises 

886 ------ 

887 FileNotFoundError 

888 A URI has been requested for a dataset that does not exist and 

889 guessing is not allowed. 

890 """ 

891 raise NotImplementedError("Must be implemented by subclass") 

892 

893 @abstractmethod 

894 def retrieveArtifacts( 

895 self, 

896 refs: Iterable[DatasetRef], 

897 destination: ResourcePath, 

898 transfer: str = "auto", 

899 preserve_path: bool = True, 

900 overwrite: bool = False, 

901 ) -> List[ResourcePath]: 

902 """Retrieve the artifacts associated with the supplied refs. 

903 

904 Parameters 

905 ---------- 

906 refs : iterable of `DatasetRef` 

907 The datasets for which artifacts are to be retrieved. 

908 A single ref can result in multiple artifacts. The refs must 

909 be resolved. 

910 destination : `lsst.resources.ResourcePath` 

911 Location to write the artifacts. 

912 transfer : `str`, optional 

913 Method to use to transfer the artifacts. Must be one of the options 

914 supported by `lsst.resources.ResourcePath.transfer_from()`. 

915 "move" is not allowed. 

916 preserve_path : `bool`, optional 

917 If `True` the full path of the artifact within the datastore 

918 is preserved. If `False` the final file component of the path 

919 is used. 

920 overwrite : `bool`, optional 

921 If `True` allow transfers to overwrite existing files at the 

922 destination. 

923 

924 Returns 

925 ------- 

926 targets : `list` of `lsst.resources.ResourcePath` 

927 URIs of file artifacts in destination location. Order is not 

928 preserved. 

929 

930 Notes 

931 ----- 

932 For non-file datastores the artifacts written to the destination 

933 may not match the representation inside the datastore. For example 

934 a hierarchical data structure in a NoSQL database may well be stored 

935 as a JSON file. 

936 """ 

937 raise NotImplementedError() 

938 
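A sketch of copying the artifacts behind a set of refs into a local directory (the destination path is illustrative):

from lsst.resources import ResourcePath

targets = datastore.retrieveArtifacts(
    refs, destination=ResourcePath("/tmp/extracted/"), transfer="copy"
)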

939 @abstractmethod 

940 def remove(self, datasetRef: DatasetRef) -> None: 

941 """Indicate to the Datastore that a Dataset can be removed. 

942 

943 Parameters 

944 ---------- 

945 datasetRef : `DatasetRef` 

946 Reference to the required Dataset. 

947 

948 Raises 

949 ------ 

950 FileNotFoundError 

951 When Dataset does not exist. 

952 

953 Notes 

954 ----- 

955 Some Datastores may implement this method as a silent no-op to 

956 disable Dataset deletion through standard interfaces. 

957 """ 

958 raise NotImplementedError("Must be implemented by subclass") 

959 

960 @abstractmethod 

961 def forget(self, refs: Iterable[DatasetRef]) -> None: 

962 """Indicate to the Datastore that it should remove all records of the 

963 given datasets, without actually deleting them. 

964 

965 Parameters 

966 ---------- 

967 refs : `Iterable` [ `DatasetRef` ] 

968 References to the datasets being forgotten. 

969 

970 Notes 

971 ----- 

972 Asking a datastore to forget a `DatasetRef` it does not hold should be 

973 a silent no-op, not an error. 

974 """ 

975 raise NotImplementedError("Must be implemented by subclass") 

976 

977 @abstractmethod 

978 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

979 """Indicate to the Datastore that a Dataset can be moved to the trash. 

980 

981 Parameters 

982 ---------- 

983 ref : `DatasetRef` or iterable thereof 

984 Reference(s) to the required Dataset. 

985 ignore_errors : `bool`, optional 

986 Determine whether errors should be ignored. When multiple 

987 refs are being trashed there will be no per-ref check. 

988 

989 Raises 

990 ------ 

991 FileNotFoundError 

992 When Dataset does not exist and errors are not ignored. Only 

993 checked if a single ref is supplied (and not in a list). 

994 

995 Notes 

996 ----- 

997 Some Datastores may implement this method as a silent no-op to 

998 disable Dataset deletion through standard interfaces. 

999 """ 

1000 raise NotImplementedError("Must be implemented by subclass") 

1001 

1002 @abstractmethod 

1003 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1004 """Remove all datasets from the trash. 

1005 

1006 Parameters 

1007 ---------- 

1008 ignore_errors : `bool`, optional 

1009 Determine whether errors should be ignored. 

1010 

1011 Notes 

1012 ----- 

1013 Some Datastores may implement this method as a silent no-op to 

1014 disable Dataset deletion through standard interfaces. 

1015 """ 

1016 raise NotImplementedError("Must be implemented by subclass") 

1017 

1018 @abstractmethod 

1019 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None: 

1020 """Transfer a dataset from another datastore to this datastore. 

1021 

1022 Parameters 

1023 ---------- 

1024 inputDatastore : `Datastore` 

1025 The external `Datastore` from which to retrieve the Dataset. 

1026 datasetRef : `DatasetRef` 

1027 Reference to the required Dataset. 

1028 """ 

1029 raise NotImplementedError("Must be implemented by subclass") 

1030 

1031 def export( 

1032 self, 

1033 refs: Iterable[DatasetRef], 

1034 *, 

1035 directory: Optional[ResourcePathExpression] = None, 

1036 transfer: Optional[str] = "auto", 

1037 ) -> Iterable[FileDataset]: 

1038 """Export datasets for transfer to another data repository. 

1039 

1040 Parameters 

1041 ---------- 

1042 refs : iterable of `DatasetRef` 

1043 Dataset references to be exported. 

1044 directory : `str`, optional 

1045 Path to a directory that should contain files corresponding to 

1046 output datasets. Ignored if ``transfer`` is explicitly `None`. 

1047 transfer : `str`, optional 

1048 Mode that should be used to move datasets out of the repository. 

1049 Valid options are the same as those of the ``transfer`` argument 

1050 to ``ingest``, and datastores may similarly signal that a transfer 

1051 mode is not supported by raising `NotImplementedError`. If "auto" 

1052 is given and no ``directory`` is specified, `None` will be 

1053 implied. 

1054 

1055 Returns 

1056 ------- 

1057 datasets : iterable of `FileDataset` 

1058 Structs containing information about the exported datasets, in the 

1059 same order as ``refs``. 

1060 

1061 Raises 

1062 ------ 

1063 NotImplementedError 

1064 Raised if the given transfer mode is not supported. 

1065 """ 

1066 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

1067 

1068 @abstractmethod 

1069 def validateConfiguration( 

1070 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

1071 ) -> None: 

1072 """Validate some of the configuration for this datastore. 

1073 

1074 Parameters 

1075 ---------- 

1076 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1077 Entities to test against this configuration. Can be differing 

1078 types. 

1079 logFailures : `bool`, optional 

1080 If `True`, output a log message for every validation error 

1081 detected. 

1082 

1083 Raises 

1084 ------ 

1085 DatastoreValidationError 

1086 Raised if there is a validation problem with a configuration. 

1087 

1088 Notes 

1089 ----- 

1090 Which parts of the configuration are validated is at the discretion 

1091 of each Datastore implementation. 

1092 """ 

1093 raise NotImplementedError("Must be implemented by subclass") 

1094 

1095 @abstractmethod 

1096 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1097 """Validate a specific look up key with supplied entity. 

1098 

1099 Parameters 

1100 ---------- 

1101 lookupKey : `LookupKey` 

1102 Key to use to retrieve information from the datastore 

1103 configuration. 

1104 entity : `DatasetRef`, `DatasetType`, or `StorageClass` 

1105 Entity to compare with configuration retrieved using the 

1106 specified lookup key. 

1107 

1108 Raises 

1109 ------ 

1110 DatastoreValidationError 

1111 Raised if there is a problem with the combination of entity 

1112 and lookup key. 

1113 

1114 Notes 

1115 ----- 

1116 Bypasses the normal selection priorities by allowing a key that 

1117 would normally not be selected to be validated. 

1118 """ 

1119 raise NotImplementedError("Must be implemented by subclass") 

1120 

1121 @abstractmethod 

1122 def getLookupKeys(self) -> Set[LookupKey]: 

1123 """Return all the lookup keys relevant to this datastore. 

1124 

1125 Returns 

1126 ------- 

1127 keys : `set` of `LookupKey` 

1128 The keys stored internally for looking up information based 

1129 on `DatasetType` name or `StorageClass`. 

1130 """ 

1131 raise NotImplementedError("Must be implemented by subclass") 

1132 

1133 def needs_expanded_data_ids( 

1134 self, 

1135 transfer: Optional[str], 

1136 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

1137 ) -> bool: 

1138 """Test whether this datastore needs expanded data IDs to ingest. 

1139 

1140 Parameters 

1141 ---------- 

1142 transfer : `str` or `None` 

1143 Transfer mode for ingest. 

1144 entity : `DatasetRef`, `DatasetType`, or `StorageClass`, optional 

1145 Object representing what will be ingested. If not provided (or not 

1146 specific enough), `True` may be returned even if expanded data 

1147 IDs aren't necessary. 

1148 

1149 Returns 

1150 ------- 

1151 needed : `bool` 

1152 If `True`, expanded data IDs may be needed. `False` only if 

1153 expansion definitely isn't necessary. 

1154 """ 

1155 return True 

1156 

1157 @abstractmethod 

1158 def import_records( 

1159 self, 

1160 data: Mapping[str, DatastoreRecordData], 

1161 ) -> None: 

1162 """Import datastore location and record data from an in-memory data 

1163 structure. 

1164 

1165 Parameters 

1166 ---------- 

1167 data : `Mapping` [ `str`, `DatastoreRecordData` ] 

1168 Datastore records indexed by datastore name. May contain data for 

1169 other `Datastore` instances (generally because they are chained to 

1170 this one), which should be ignored. 

1171 

1172 Notes 

1173 ----- 

1174 Implementations should generally not check that any external resources 

1175 (e.g. files) referred to by these records actually exist, for 

1176 performance reasons; we expect higher-level code to guarantee that they 

1177 do. 

1178 

1179 Implementations are responsible for calling 

1180 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations`` 

1181 where the key is in `names`, as well as loading any opaque table data. 

1182 """ 

1183 raise NotImplementedError() 

1184 

1185 @abstractmethod 

1186 def export_records( 

1187 self, 

1188 refs: Iterable[DatasetIdRef], 

1189 ) -> Mapping[str, DatastoreRecordData]: 

1190 """Export datastore records and locations to an in-memory data 

1191 structure. 

1192 

1193 Parameters 

1194 ---------- 

1195 refs : `Iterable` [ `DatasetIdRef` ] 

1196 Datasets to save. This may include datasets not known to this 

1197 datastore, which should be ignored. 

1198 

1199 Returns 

1200 ------- 

1201 data : `Mapping` [ `str`, `DatastoreRecordData` ] 

1202 Exported datastore records indexed by datastore name. 

1203 """ 

1204 raise NotImplementedError()