# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Support for generic data stores."""

from __future__ import annotations

__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")

import contextlib
import dataclasses
import logging
from abc import ABCMeta, abstractmethod
from collections import abc, defaultdict
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Type,
    Union,
)

from lsst.utils import doImportType

from .config import Config, ConfigSubset
from .constraints import Constraints
from .exceptions import DatasetTypeNotSupportedError, ValidationError
from .fileDataset import FileDataset
from .storageClass import StorageClassFactory

if TYPE_CHECKING:
    from lsst.resources import ResourcePath, ResourcePathExpression

    from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager
    from .configSupport import LookupKey
    from .datasets import DatasetRef, DatasetType
    from .datastoreRecordData import DatastoreRecordData
    from .storageClass import StorageClass


class DatastoreConfig(ConfigSubset):
    """Configuration for Datastores."""

    component = "datastore"
    requiredKeys = ("cls",)
    defaultConfigFile = "datastore.yaml"


class DatastoreValidationError(ValidationError):
    """There is a problem with the Datastore configuration."""

    pass


@dataclasses.dataclass(frozen=True)
class Event:
    """Representation of an event that can be rolled back."""

    __slots__ = {"name", "undoFunc", "args", "kwargs"}
    name: str
    undoFunc: Callable
    args: tuple
    kwargs: dict


class IngestPrepData:
    """A helper base class for `Datastore` ingest implementations.

    Datastore implementations will generally need a custom implementation of
    this class.

    Should be accessed as ``Datastore.IngestPrepData`` instead of via direct
    import.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        References for the datasets that can be ingested by this datastore.
    """

    def __init__(self, refs: Iterable[DatasetRef]):
        self.refs = {ref.id: ref for ref in refs}


class DatastoreTransaction:
    """Keeps a log of `Datastore` activity and allows rollback.

    Parameters
    ----------
    parent : `DatastoreTransaction`, optional
        The parent transaction (if any).
    """

    Event: ClassVar[Type] = Event

    parent: Optional[DatastoreTransaction]
    """The parent transaction. (`DatastoreTransaction`, optional)"""

    def __init__(self, parent: Optional[DatastoreTransaction] = None):
        self.parent = parent
        self._log: List[Event] = []

    def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None:
        """Register event with undo function.

        Parameters
        ----------
        name : `str`
            Name of the event.
        undoFunc : func
            Function to undo this event.
        *args : `tuple`
            Positional arguments to ``undoFunc``.
        **kwargs
            Keyword arguments to ``undoFunc``.
        """
        self._log.append(self.Event(name, undoFunc, args, kwargs))

    @contextlib.contextmanager
    def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]:
        """Register undo function if nested operation succeeds.

        Calls `registerUndo`.

        This can be used to wrap individual undo-able statements within a
        `DatastoreTransaction` block. Multiple statements that can fail
        separately should not be part of the same `undoWith` block.

        All arguments are forwarded directly to `registerUndo`.

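        Examples
        --------
        A minimal sketch, assuming ``txn`` is an active
        `DatastoreTransaction` and ``do_write``, ``undo_write``, and ``path``
        are hypothetical callables and arguments supplied by the caller:

        >>> with txn.undoWith("write", undo_write, path):
        ...     do_write(path)

        Here ``undo_write(path)`` is registered for a later `rollback` only
        if ``do_write(path)`` completes without raising.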
155 """ 

156 try: 

157 yield None 

158 except BaseException: 

159 raise 

160 else: 

161 self.registerUndo(name, undoFunc, *args, **kwargs) 

162 

163 def rollback(self) -> None: 

164 """Roll back all events in this transaction.""" 

165 log = logging.getLogger(__name__) 

166 while self._log: 

167 ev = self._log.pop() 

168 try: 

169 log.debug( 

170 "Rolling back transaction: %s: %s(%s,%s)", 

171 ev.name, 

172 ev.undoFunc, 

173 ",".join(str(a) for a in ev.args), 

174 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()), 

175 ) 

176 except Exception: 

177 # In case we had a problem in stringification of arguments 

178 log.warning("Rolling back transaction: %s", ev.name) 

179 try: 

180 ev.undoFunc(*ev.args, **ev.kwargs) 

181 except BaseException as e: 

182 # Deliberately swallow error that may occur in unrolling 

183 log.warning("Exception: %s caught while unrolling: %s", e, ev.name) 

184 pass 

185 

186 def commit(self) -> None: 

187 """Commit this transaction.""" 

188 if self.parent is None: 

189 # Just forget about the events, they have already happened. 

190 return 

191 else: 

192 # We may still want to events from this transaction as part of 

193 # the parent. 

194 self.parent._log.extend(self._log) 

195 

196 

197@dataclasses.dataclass 

198class DatasetRefURIs(abc.Sequence): 

199 """Represents the primary and component ResourcePath(s) associated with a 

200 DatasetRef. 

201 

202 This is used in places where its members used to be represented as a tuple 

203 `(primaryURI, componentURIs)`. To maintain backward compatibility this 

204 inherits from Sequence and so instances can be treated as a two-item 

205 tuple. 

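    Examples
    --------
    Instances unpack like the two-item tuple they replace (a minimal,
    self-contained sketch; ``uris`` would normally come from
    `Datastore.getURIs`):

    >>> uris = DatasetRefURIs(None, {})
    >>> primary, components = uris
    >>> len(uris)
    2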
206 """ 

207 

208 def __init__( 

209 self, 

210 primaryURI: Optional[ResourcePath] = None, 

211 componentURIs: Optional[Dict[str, ResourcePath]] = None, 

212 ): 

213 self.primaryURI = primaryURI 

214 """The URI to the primary artifact associated with this dataset. If the 

215 dataset was disassembled within the datastore this may be `None`. 

216 """ 

217 

218 self.componentURIs = componentURIs or {} 

219 """The URIs to any components associated with the dataset artifact 

220 indexed by component name. This can be empty if there are no 

221 components. 

222 """ 

223 

224 def __getitem__(self, index: Any) -> Any: 

225 """Get primaryURI and componentURIs by index. 

226 

227 Provides support for tuple-like access. 

228 """ 

229 if index == 0: 

230 return self.primaryURI 

231 elif index == 1: 

232 return self.componentURIs 

233 raise IndexError("list index out of range") 

234 

235 def __len__(self) -> int: 

236 """Get the number of data members. 

237 

238 Provides support for tuple-like access. 

239 """ 

240 return 2 

241 

242 def __repr__(self) -> str: 

243 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})" 

244 

245 

246class Datastore(metaclass=ABCMeta): 

247 """Datastore interface. 

248 

249 Parameters 

250 ---------- 

251 config : `DatastoreConfig` or `str` 

252 Load configuration either from an existing config instance or by 

253 referring to a configuration file. 

254 bridgeManager : `DatastoreRegistryBridgeManager` 

255 Object that manages the interface between `Registry` and datastores. 

256 butlerRoot : `str`, optional 

257 New datastore root to use to override the configuration value. 

258 """ 

259 

260 defaultConfigFile: ClassVar[Optional[str]] = None 

261 """Path to configuration defaults. Accessed within the ``config`` resource 

262 or relative to a search path. Can be None if no defaults specified. 

263 """ 

264 

265 containerKey: ClassVar[Optional[str]] = None 

266 """Name of the key containing a list of subconfigurations that also 

267 need to be merged with defaults and will likely use different Python 

268 datastore classes (but all using DatastoreConfig). Assumed to be a 

269 list of configurations that can be represented in a DatastoreConfig 

270 and containing a "cls" definition. None indicates that no containers 

271 are expected in this Datastore.""" 

272 

273 isEphemeral: bool = False 

274 """Indicate whether this Datastore is ephemeral or not. An ephemeral 

275 datastore is one where the contents of the datastore will not exist 

276 across process restarts. This value can change per-instance.""" 

277 

278 config: DatastoreConfig 

279 """Configuration used to create Datastore.""" 

280 

281 name: str 

282 """Label associated with this Datastore.""" 

283 

284 storageClassFactory: StorageClassFactory 

285 """Factory for creating storage class instances from name.""" 

286 

287 constraints: Constraints 

288 """Constraints to apply when putting datasets into the datastore.""" 

289 

290 # MyPy does not like for this to be annotated as any kind of type, because 

291 # it can't do static checking on type variables that can change at runtime. 

292 IngestPrepData: ClassVar[Any] = IngestPrepData 

293 """Helper base class for ingest implementations. 

294 """ 

295 

296 @classmethod 

297 @abstractmethod 

298 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

299 """Set filesystem-dependent config options for this datastore. 

300 

301 The options will be appropriate for a new empty repository with the 

302 given root. 

303 

304 Parameters 

305 ---------- 

306 root : `str` 

307 Filesystem path to the root of the data repository. 

308 config : `Config` 

309 A `Config` to update. Only the subset understood by 

310 this component will be updated. Will not expand 

311 defaults. 

312 full : `Config` 

313 A complete config with all defaults expanded that can be 

314 converted to a `DatastoreConfig`. Read-only and will not be 

315 modified by this method. 

316 Repository-specific options that should not be obtained 

317 from defaults when Butler instances are constructed 

318 should be copied from ``full`` to ``config``. 

319 overwrite : `bool`, optional 

320 If `False`, do not modify a value in ``config`` if the value 

321 already exists. Default is always to overwrite with the provided 

322 ``root``. 

323 

324 Notes 

325 ----- 

326 If a keyword is explicitly defined in the supplied ``config`` it 

327 will not be overridden by this method if ``overwrite`` is `False`. 

328 This allows explicit values set in external configs to be retained. 

329 """ 

330 raise NotImplementedError() 

331 

332 @staticmethod 

333 def fromConfig( 

334 config: Config, 

335 bridgeManager: DatastoreRegistryBridgeManager, 

336 butlerRoot: Optional[ResourcePathExpression] = None, 

337 ) -> "Datastore": 

338 """Create datastore from type specified in config file. 

339 

340 Parameters 

341 ---------- 

342 config : `Config` 

343 Configuration instance. 

344 bridgeManager : `DatastoreRegistryBridgeManager` 

345 Object that manages the interface between `Registry` and 

346 datastores. 

347 butlerRoot : `str`, optional 

348 Butler root directory. 

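        Examples
        --------
        A sketch of config-driven construction (the class path shown is
        illustrative and ``bridgeManager`` is assumed to already exist):

        >>> config = Config(
        ...     {"datastore": {"cls": "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"}}
        ... )
        >>> store = Datastore.fromConfig(config, bridgeManager)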
349 """ 

350 cls = doImportType(config["datastore", "cls"]) 

351 if not issubclass(cls, Datastore): 

352 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore") 

353 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot) 

354 

355 def __init__( 

356 self, 

357 config: Union[Config, str], 

358 bridgeManager: DatastoreRegistryBridgeManager, 

359 butlerRoot: Optional[ResourcePathExpression] = None, 

360 ): 

361 self.config = DatastoreConfig(config) 

362 self.name = "ABCDataStore" 

363 self._transaction: Optional[DatastoreTransaction] = None 

364 

365 # All Datastores need storage classes and constraints 

366 self.storageClassFactory = StorageClassFactory() 

367 

368 # And read the constraints list 

369 constraintsConfig = self.config.get("constraints") 

370 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe) 

371 

372 def __str__(self) -> str: 

373 return self.name 

374 

375 def __repr__(self) -> str: 

376 return self.name 

377 

378 @property 

379 def names(self) -> Tuple[str, ...]: 

380 """Names associated with this datastore returned as a list. 

381 

382 Can be different to ``name`` for a chaining datastore. 

383 """ 

384 # Default implementation returns solely the name itself 

385 return (self.name,) 

386 

387 @contextlib.contextmanager 

388 def transaction(self) -> Iterator[DatastoreTransaction]: 

389 """Context manager supporting `Datastore` transactions. 

390 

391 Transactions can be nested, and are to be used in combination with 

392 `Registry.transaction`. 

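        Examples
        --------
        A sketch of the intended use (``store``, ``data``, and ``ref`` are
        illustrative names):

        >>> with store.transaction():
        ...     store.put(data, ref)

        If the block raises, every event registered inside it is rolled
        back before the exception propagates.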
393 """ 

394 self._transaction = DatastoreTransaction(self._transaction) 

395 try: 

396 yield self._transaction 

397 except BaseException: 

398 self._transaction.rollback() 

399 raise 

400 else: 

401 self._transaction.commit() 

402 self._transaction = self._transaction.parent 

403 

404 @abstractmethod 

405 def knows(self, ref: DatasetRef) -> bool: 

406 """Check if the dataset is known to the datastore. 

407 

408 Does not check for existence of any artifact. 

409 

410 Parameters 

411 ---------- 

412 ref : `DatasetRef` 

413 Reference to the required dataset. 

414 

415 Returns 

416 ------- 

417 exists : `bool` 

418 `True` if the dataset is known to the datastore. 

419 """ 

420 raise NotImplementedError() 

421 

422 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

423 """Check which of the given datasets are known to this datastore. 

424 

425 This is like ``mexist()`` but does not check that the file exists. 

426 

427 Parameters 

428 ---------- 

429 refs : iterable `DatasetRef` 

430 The datasets to check. 

431 

432 Returns 

433 ------- 

434 exists : `dict`[`DatasetRef`, `bool`] 

435 Mapping of dataset to boolean indicating whether the dataset 

436 is known to the datastore. 

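        Examples
        --------
        A sketch (``store`` and ``refs`` are illustrative):

        >>> known = store.knows_these(refs)
        >>> unknown = [ref for ref, exists in known.items() if not exists]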
437 """ 

438 # Non-optimized default calls knows() repeatedly. 

439 return {ref: self.knows(ref) for ref in refs} 

440 

441 def mexists( 

442 self, refs: Iterable[DatasetRef], artifact_existence: Optional[Dict[ResourcePath, bool]] = None 

443 ) -> Dict[DatasetRef, bool]: 

444 """Check the existence of multiple datasets at once. 

445 

446 Parameters 

447 ---------- 

448 refs : iterable of `DatasetRef` 

449 The datasets to be checked. 

450 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

451 Optional mapping of datastore artifact to existence. Updated by 

452 this method with details of all artifacts tested. Can be `None` 

453 if the caller is not interested. 

454 

455 Returns 

456 ------- 

457 existence : `dict` of [`DatasetRef`, `bool`] 

458 Mapping from dataset to boolean indicating existence. 

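        Examples
        --------
        A sketch that reuses an artifact-existence cache across calls
        (``store`` and ``refs`` are illustrative):

        >>> cache = {}
        >>> existence = store.mexists(refs, artifact_existence=cache)
        >>> missing = [ref for ref, exists in existence.items() if not exists]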
459 """ 

460 existence: Dict[DatasetRef, bool] = {} 

461 # Non-optimized default. 

462 for ref in refs: 

463 existence[ref] = self.exists(ref) 

464 return existence 

465 

466 @abstractmethod 

467 def exists(self, datasetRef: DatasetRef) -> bool: 

468 """Check if the dataset exists in the datastore. 

469 

470 Parameters 

471 ---------- 

472 datasetRef : `DatasetRef` 

473 Reference to the required dataset. 

474 

475 Returns 

476 ------- 

477 exists : `bool` 

478 `True` if the entity exists in the `Datastore`. 

479 """ 

480 raise NotImplementedError("Must be implemented by subclass") 

481 

482 @abstractmethod 

483 def get( 

484 self, 

485 datasetRef: DatasetRef, 

486 parameters: Mapping[str, Any] | None = None, 

487 storageClass: Optional[Union[StorageClass, str]] = None, 

488 ) -> Any: 

489 """Load an `InMemoryDataset` from the store. 

490 

491 Parameters 

492 ---------- 

493 datasetRef : `DatasetRef` 

494 Reference to the required Dataset. 

495 parameters : `dict` 

496 `StorageClass`-specific parameters that specify a slice of the 

497 Dataset to be loaded. 

498 storageClass : `StorageClass` or `str`, optional 

499 The storage class to be used to override the Python type 

500 returned by this method. By default the returned type matches 

501 the dataset type definition for this dataset. Specifying a 

502 read `StorageClass` can force a different type to be returned. 

503 This type must be compatible with the original type. 

504 

505 Returns 

506 ------- 

507 inMemoryDataset : `object` 

508 Requested Dataset or slice thereof as an InMemoryDataset. 

509 """ 

510 raise NotImplementedError("Must be implemented by subclass") 

511 

512 @abstractmethod 

513 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None: 

514 """Write a `InMemoryDataset` with a given `DatasetRef` to the store. 

515 

516 Parameters 

517 ---------- 

518 inMemoryDataset : `object` 

519 The Dataset to store. 

520 datasetRef : `DatasetRef` 

521 Reference to the associated Dataset. 

522 """ 

523 raise NotImplementedError("Must be implemented by subclass") 

524 

525 def _overrideTransferMode(self, *datasets: FileDataset, transfer: Optional[str] = None) -> Optional[str]: 

526 """Allow ingest transfer mode to be defaulted based on datasets. 

527 

528 Parameters 

529 ---------- 

530 datasets : `FileDataset` 

531 Each positional argument is a struct containing information about 

532 a file to be ingested, including its path (either absolute or 

533 relative to the datastore root, if applicable), a complete 

534 `DatasetRef` (with ``dataset_id not None``), and optionally a 

535 formatter class or its fully-qualified string name. If a formatter 

536 is not provided, this method should populate that attribute with 

537 the formatter the datastore would use for `put`. Subclasses are 

538 also permitted to modify the path attribute (typically to put it 

539 in what the datastore considers its standard form). 

540 transfer : `str`, optional 

541 How (and whether) the dataset should be added to the datastore. 

542 See `ingest` for details of transfer modes. 

543 

544 Returns 

545 ------- 

546 newTransfer : `str` 

547 Transfer mode to use. Will be identical to the supplied transfer 

548 mode unless "auto" is used. 

549 """ 

550 if transfer != "auto": 

551 return transfer 

552 raise RuntimeError(f"{transfer} is not allowed without specialization.") 

553 

554 def _prepIngest(self, *datasets: FileDataset, transfer: Optional[str] = None) -> IngestPrepData: 

555 """Process datasets to identify which ones can be ingested. 

556 

557 Parameters 

558 ---------- 

559 datasets : `FileDataset` 

560 Each positional argument is a struct containing information about 

561 a file to be ingested, including its path (either absolute or 

562 relative to the datastore root, if applicable), a complete 

563 `DatasetRef` (with ``dataset_id not None``), and optionally a 

564 formatter class or its fully-qualified string name. If a formatter 

565 is not provided, this method should populate that attribute with 

566 the formatter the datastore would use for `put`. Subclasses are 

567 also permitted to modify the path attribute (typically to put it 

568 in what the datastore considers its standard form). 

569 transfer : `str`, optional 

570 How (and whether) the dataset should be added to the datastore. 

571 See `ingest` for details of transfer modes. 

572 

573 Returns 

574 ------- 

575 data : `IngestPrepData` 

576 An instance of a subclass of `IngestPrepData`, used to pass 

577 arbitrary data from `_prepIngest` to `_finishIngest`. This should 

578 include only the datasets this datastore can actually ingest; 

579 others should be silently ignored (`Datastore.ingest` will inspect 

580 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if 

581 necessary). 

582 

583 Raises 

584 ------ 

585 NotImplementedError 

586 Raised if the datastore does not support the given transfer mode 

587 (including the case where ingest is not supported at all). 

588 FileNotFoundError 

589 Raised if one of the given files does not exist. 

590 FileExistsError 

591 Raised if transfer is not `None` but the (internal) location the 

592 file would be moved to is already occupied. 

593 

594 Notes 

595 ----- 

596 This method (along with `_finishIngest`) should be implemented by 

597 subclasses to provide ingest support instead of implementing `ingest` 

598 directly. 

599 

600 `_prepIngest` should not modify the data repository or given files in 

601 any way; all changes should be deferred to `_finishIngest`. 

602 

603 When possible, exceptions should be raised in `_prepIngest` instead of 

604 `_finishIngest`. `NotImplementedError` exceptions that indicate that 

605 the transfer mode is not supported must be raised by `_prepIngest` 

606 instead of `_finishIngest`. 

607 """ 

608 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

609 

610 def _finishIngest( 

611 self, prepData: IngestPrepData, *, transfer: Optional[str] = None, record_validation_info: bool = True 

612 ) -> None: 

613 """Complete an ingest operation. 

614 

615 Parameters 

616 ---------- 

617 data : `IngestPrepData` 

618 An instance of a subclass of `IngestPrepData`. Guaranteed to be 

619 the direct result of a call to `_prepIngest` on this datastore. 

620 transfer : `str`, optional 

621 How (and whether) the dataset should be added to the datastore. 

622 See `ingest` for details of transfer modes. 

623 record_validation_info : `bool`, optional 

624 If `True`, the default, the datastore can record validation 

625 information associated with the file. If `False` the datastore 

626 will not attempt to track any information such as checksums 

627 or file sizes. This can be useful if such information is tracked 

628 in an external system or if the file is to be compressed in place. 

629 It is up to the datastore whether this parameter is relevant. 

630 

631 Raises 

632 ------ 

633 FileNotFoundError 

634 Raised if one of the given files does not exist. 

635 FileExistsError 

636 Raised if transfer is not `None` but the (internal) location the 

637 file would be moved to is already occupied. 

638 

639 Notes 

640 ----- 

641 This method (along with `_prepIngest`) should be implemented by 

642 subclasses to provide ingest support instead of implementing `ingest` 

643 directly. 

644 """ 

645 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

646 

647 def ingest( 

648 self, *datasets: FileDataset, transfer: Optional[str] = None, record_validation_info: bool = True 

649 ) -> None: 

650 """Ingest one or more files into the datastore. 

651 

652 Parameters 

653 ---------- 

654 datasets : `FileDataset` 

655 Each positional argument is a struct containing information about 

656 a file to be ingested, including its path (either absolute or 

657 relative to the datastore root, if applicable), a complete 

658 `DatasetRef` (with ``dataset_id not None``), and optionally a 

659 formatter class or its fully-qualified string name. If a formatter 

660 is not provided, the one the datastore would use for ``put`` on 

661 that dataset is assumed. 

662 transfer : `str`, optional 

663 How (and whether) the dataset should be added to the datastore. 

664 If `None` (default), the file must already be in a location 

665 appropriate for the datastore (e.g. within its root directory), 

666 and will not be modified. Other choices include "move", "copy", 

667 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

668 special transfer mode that will first try to make a hardlink and 

669 if that fails a symlink will be used instead. "relsymlink" creates 

670 a relative symlink rather than use an absolute path. 

671 Most datastores do not support all transfer modes. 

672 "auto" is a special option that will let the 

673 data store choose the most natural option for itself. 

674 record_validation_info : `bool`, optional 

675 If `True`, the default, the datastore can record validation 

676 information associated with the file. If `False` the datastore 

677 will not attempt to track any information such as checksums 

678 or file sizes. This can be useful if such information is tracked 

679 in an external system or if the file is to be compressed in place. 

680 It is up to the datastore whether this parameter is relevant. 

681 

682 Raises 

683 ------ 

684 NotImplementedError 

685 Raised if the datastore does not support the given transfer mode 

686 (including the case where ingest is not supported at all). 

687 DatasetTypeNotSupportedError 

688 Raised if one or more files to be ingested have a dataset type that 

689 is not supported by the datastore. 

690 FileNotFoundError 

691 Raised if one of the given files does not exist. 

692 FileExistsError 

693 Raised if transfer is not `None` but the (internal) location the 

694 file would be moved to is already occupied. 

695 

696 Notes 

697 ----- 

698 Subclasses should implement `_prepIngest` and `_finishIngest` instead 

699 of implementing `ingest` directly. Datastores that hold and 

700 delegate to child datastores may want to call those methods as well. 

701 

702 Subclasses are encouraged to document their supported transfer modes 

703 in their class documentation. 

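        Examples
        --------
        A sketch of ingesting one external file by copying it into the
        datastore (the path is illustrative and ``ref`` must be a resolved
        `DatasetRef`):

        >>> dataset = FileDataset(path="/some/raw_file.fits", refs=ref)
        >>> store.ingest(dataset, transfer="copy")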
704 """ 

705 # Allow a datastore to select a default transfer mode 

706 transfer = self._overrideTransferMode(*datasets, transfer=transfer) 

707 prepData = self._prepIngest(*datasets, transfer=transfer) 

708 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs} 

709 if None in refs: 

710 # Find the file for the error message. There may be multiple 

711 # bad refs so look for all of them. 

712 unresolved_paths = {} 

713 for dataset in datasets: 

714 unresolved = [] 

715 for ref in dataset.refs: 

716 if ref.id is None: 

717 unresolved.append(ref) 

718 if unresolved: 

719 unresolved_paths[dataset.path] = unresolved 

720 raise RuntimeError( 

721 "Attempt to ingest unresolved DatasetRef from: " 

722 + ",".join(f"{p}: ({[str(r) for r in ref]})" for p, ref in unresolved_paths.items()) 

723 ) 

724 if refs.keys() != prepData.refs.keys(): 

725 unsupported = refs.keys() - prepData.refs.keys() 

726 # Group unsupported refs by DatasetType for an informative 

727 # but still concise error message. 

728 byDatasetType = defaultdict(list) 

729 for datasetId in unsupported: 

730 ref = refs[datasetId] 

731 byDatasetType[ref.datasetType].append(ref) 

732 raise DatasetTypeNotSupportedError( 

733 "DatasetType(s) not supported in ingest: " 

734 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items()) 

735 ) 

736 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info) 

737 

738 def transfer_from( 

739 self, 

740 source_datastore: Datastore, 

741 refs: Iterable[DatasetRef], 

742 transfer: str = "auto", 

743 artifact_existence: Optional[Dict[ResourcePath, bool]] = None, 

744 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

745 """Transfer dataset artifacts from another datastore to this one. 

746 

747 Parameters 

748 ---------- 

749 source_datastore : `Datastore` 

750 The datastore from which to transfer artifacts. That datastore 

751 must be compatible with this datastore receiving the artifacts. 

752 refs : iterable of `DatasetRef` 

753 The datasets to transfer from the source datastore. 

754 transfer : `str`, optional 

755 How (and whether) the dataset should be added to the datastore. 

756 Choices include "move", "copy", 

757 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

758 special transfer mode that will first try to make a hardlink and 

759 if that fails a symlink will be used instead. "relsymlink" creates 

760 a relative symlink rather than use an absolute path. 

761 Most datastores do not support all transfer modes. 

762 "auto" (the default) is a special option that will let the 

763 data store choose the most natural option for itself. 

764 If the source location and transfer location are identical the 

765 transfer mode will be ignored. 

766 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

767 Optional mapping of datastore artifact to existence. Updated by 

768 this method with details of all artifacts tested. Can be `None` 

769 if the caller is not interested. 

770 

771 Returns 

772 ------- 

773 accepted : `set` [`DatasetRef`] 

774 The datasets that were transferred. 

775 rejected : `set` [`DatasetRef`] 

776 The datasets that were rejected due to a constraints violation. 

777 

778 Raises 

779 ------ 

780 TypeError 

781 Raised if the two datastores are not compatible. 

782 """ 

783 if type(self) is not type(source_datastore): 

784 raise TypeError( 

785 f"Datastore mismatch between this datastore ({type(self)}) and the " 

786 f"source datastore ({type(source_datastore)})." 

787 ) 

788 

789 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.") 

790 

791 def getManyURIs( 

792 self, 

793 refs: Iterable[DatasetRef], 

794 predict: bool = False, 

795 allow_missing: bool = False, 

796 ) -> Dict[DatasetRef, DatasetRefURIs]: 

797 """Return URIs associated with many datasets. 

798 

799 Parameters 

800 ---------- 

801 refs : iterable of `DatasetIdRef` 

802 References to the required datasets. 

803 predict : `bool`, optional 

804 If the datastore does not know about a dataset, should it 

805 return a predicted URI or not? 

806 allow_missing : `bool` 

807 If `False`, and `predict` is `False`, will raise if a `DatasetRef` 

808 does not exist. 

809 

810 Returns 

811 ------- 

812 URIs : `dict` of [`DatasetRef`, `DatasetRefUris`] 

813 A dict of primary and component URIs, indexed by the passed-in 

814 refs. 

815 

816 Raises 

817 ------ 

818 FileNotFoundError 

819 A URI has been requested for a dataset that does not exist and 

820 guessing is not allowed. 

821 

822 Notes 

823 ----- 

824 In file-based datastores, getManuURIs does not check that the file is 

825 really there, it's assuming it is if datastore is aware of the file 

826 then it actually exists. 

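        Examples
        --------
        A sketch (``store`` and ``refs`` are illustrative):

        >>> uris = store.getManyURIs(refs, predict=True)
        >>> for ref, ref_uris in uris.items():
        ...     print(ref, ref_uris.primaryURI)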
827 """ 

828 uris: Dict[DatasetRef, DatasetRefURIs] = {} 

829 missing_refs = [] 

830 for ref in refs: 

831 try: 

832 uris[ref] = self.getURIs(ref, predict=predict) 

833 except FileNotFoundError: 

834 missing_refs.append(ref) 

835 if missing_refs and not allow_missing: 

836 raise FileNotFoundError( 

837 "Missing {} refs from datastore out of {} and predict=False.".format( 

838 num_missing := len(missing_refs), num_missing + len(uris) 

839 ) 

840 ) 

841 return uris 

842 

843 @abstractmethod 

844 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

845 """Return URIs associated with dataset. 

846 

847 Parameters 

848 ---------- 

849 ref : `DatasetRef` 

850 Reference to the required dataset. 

851 predict : `bool`, optional 

852 If the datastore does not know about the dataset, should it 

853 return a predicted URI or not? 

854 

855 Returns 

856 ------- 

857 uris : `DatasetRefURIs` 

858 The URI to the primary artifact associated with this dataset (if 

859 the dataset was disassembled within the datastore this may be 

860 `None`), and the URIs to any components associated with the dataset 

861 artifact. (can be empty if there are no components). 

862 """ 

863 raise NotImplementedError() 

864 

865 @abstractmethod 

866 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath: 

867 """URI to the Dataset. 

868 

869 Parameters 

870 ---------- 

871 datasetRef : `DatasetRef` 

872 Reference to the required Dataset. 

873 predict : `bool` 

874 If `True` attempt to predict the URI for a dataset if it does 

875 not exist in datastore. 

876 

877 Returns 

878 ------- 

879 uri : `str` 

880 URI string pointing to the Dataset within the datastore. If the 

881 Dataset does not exist in the datastore, the URI may be a guess. 

882 If the datastore does not have entities that relate well 

883 to the concept of a URI the returned URI string will be 

884 descriptive. The returned URI is not guaranteed to be obtainable. 

885 

886 Raises 

887 ------ 

888 FileNotFoundError 

889 A URI has been requested for a dataset that does not exist and 

890 guessing is not allowed. 

891 """ 

892 raise NotImplementedError("Must be implemented by subclass") 

893 

894 @abstractmethod 

895 def retrieveArtifacts( 

896 self, 

897 refs: Iterable[DatasetRef], 

898 destination: ResourcePath, 

899 transfer: str = "auto", 

900 preserve_path: bool = True, 

901 overwrite: bool = False, 

902 ) -> List[ResourcePath]: 

903 """Retrieve the artifacts associated with the supplied refs. 

904 

905 Parameters 

906 ---------- 

907 refs : iterable of `DatasetRef` 

908 The datasets for which artifacts are to be retrieved. 

909 A single ref can result in multiple artifacts. The refs must 

910 be resolved. 

911 destination : `lsst.resources.ResourcePath` 

912 Location to write the artifacts. 

913 transfer : `str`, optional 

914 Method to use to transfer the artifacts. Must be one of the options 

915 supported by `lsst.resources.ResourcePath.transfer_from()`. 

916 "move" is not allowed. 

917 preserve_path : `bool`, optional 

918 If `True` the full path of the artifact within the datastore 

919 is preserved. If `False` the final file component of the path 

920 is used. 

921 overwrite : `bool`, optional 

922 If `True` allow transfers to overwrite existing files at the 

923 destination. 

924 

925 Returns 

926 ------- 

927 targets : `list` of `lsst.resources.ResourcePath` 

928 URIs of file artifacts in destination location. Order is not 

929 preserved. 

930 

931 Notes 

932 ----- 

933 For non-file datastores the artifacts written to the destination 

934 may not match the representation inside the datastore. For example 

935 a hierarchichal data structure in a NoSQL database may well be stored 

936 as a JSON file. 

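        Examples
        --------
        A sketch of copying all artifacts for some refs into a local
        directory (names are illustrative):

        >>> destination = ResourcePath("/tmp/export/", forceDirectory=True)
        >>> targets = store.retrieveArtifacts(refs, destination, transfer="copy")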
937 """ 

938 raise NotImplementedError() 

939 

940 @abstractmethod 

941 def remove(self, datasetRef: DatasetRef) -> None: 

942 """Indicate to the Datastore that a Dataset can be removed. 

943 

944 Parameters 

945 ---------- 

946 datasetRef : `DatasetRef` 

947 Reference to the required Dataset. 

948 

949 Raises 

950 ------ 

951 FileNotFoundError 

952 When Dataset does not exist. 

953 

954 Notes 

955 ----- 

956 Some Datastores may implement this method as a silent no-op to 

957 disable Dataset deletion through standard interfaces. 

958 """ 

959 raise NotImplementedError("Must be implemented by subclass") 

960 

961 @abstractmethod 

962 def forget(self, refs: Iterable[DatasetRef]) -> None: 

963 """Indicate to the Datastore that it should remove all records of the 

964 given datasets, without actually deleting them. 

965 

966 Parameters 

967 ---------- 

968 refs : `Iterable` [ `DatasetRef` ] 

969 References to the datasets being forgotten. 

970 

971 Notes 

972 ----- 

973 Asking a datastore to forget a `DatasetRef` it does not hold should be 

974 a silent no-op, not an error. 

975 """ 

976 raise NotImplementedError("Must be implemented by subclass") 

977 

978 @abstractmethod 

979 def trash(self, ref: Union[DatasetRef, Iterable[DatasetRef]], ignore_errors: bool = True) -> None: 

980 """Indicate to the Datastore that a Dataset can be moved to the trash. 

981 

982 Parameters 

983 ---------- 

984 ref : `DatasetRef` or iterable thereof 

985 Reference(s) to the required Dataset. 

986 ignore_errors : `bool`, optional 

987 Determine whether errors should be ignored. When multiple 

988 refs are being trashed there will be no per-ref check. 

989 

990 Raises 

991 ------ 

992 FileNotFoundError 

993 When Dataset does not exist and errors are not ignored. Only 

994 checked if a single ref is supplied (and not in a list). 

995 

996 Notes 

997 ----- 

998 Some Datastores may implement this method as a silent no-op to 

999 disable Dataset deletion through standard interfaces. 

1000 """ 

1001 raise NotImplementedError("Must be implemented by subclass") 

1002 

1003 @abstractmethod 

1004 def emptyTrash(self, ignore_errors: bool = True) -> None: 

1005 """Remove all datasets from the trash. 

1006 

1007 Parameters 

1008 ---------- 

1009 ignore_errors : `bool`, optional 

1010 Determine whether errors should be ignored. 

1011 

1012 Notes 

1013 ----- 

1014 Some Datastores may implement this method as a silent no-op to 

1015 disable Dataset deletion through standard interfaces. 

1016 """ 

1017 raise NotImplementedError("Must be implemented by subclass") 

1018 

1019 @abstractmethod 

1020 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None: 

1021 """Transfer a dataset from another datastore to this datastore. 

1022 

1023 Parameters 

1024 ---------- 

1025 inputDatastore : `Datastore` 

1026 The external `Datastore` from which to retrieve the Dataset. 

1027 datasetRef : `DatasetRef` 

1028 Reference to the required Dataset. 

1029 """ 

1030 raise NotImplementedError("Must be implemented by subclass") 

1031 

1032 def export( 

1033 self, 

1034 refs: Iterable[DatasetRef], 

1035 *, 

1036 directory: Optional[ResourcePathExpression] = None, 

1037 transfer: Optional[str] = "auto", 

1038 ) -> Iterable[FileDataset]: 

1039 """Export datasets for transfer to another data repository. 

1040 

1041 Parameters 

1042 ---------- 

1043 refs : iterable of `DatasetRef` 

1044 Dataset references to be exported. 

1045 directory : `str`, optional 

1046 Path to a directory that should contain files corresponding to 

1047 output datasets. Ignored if ``transfer`` is explicitly `None`. 

1048 transfer : `str`, optional 

1049 Mode that should be used to move datasets out of the repository. 

1050 Valid options are the same as those of the ``transfer`` argument 

1051 to ``ingest``, and datastores may similarly signal that a transfer 

1052 mode is not supported by raising `NotImplementedError`. If "auto" 

1053 is given and no ``directory`` is specified, `None` will be 

1054 implied. 

1055 

1056 Returns 

1057 ------- 

1058 dataset : iterable of `DatasetTransfer` 

1059 Structs containing information about the exported datasets, in the 

1060 same order as ``refs``. 

1061 

1062 Raises 

1063 ------ 

1064 NotImplementedError 

1065 Raised if the given transfer mode is not supported. 

1066 """ 

1067 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

1068 

1069 @abstractmethod 

1070 def validateConfiguration( 

1071 self, entities: Iterable[Union[DatasetRef, DatasetType, StorageClass]], logFailures: bool = False 

1072 ) -> None: 

1073 """Validate some of the configuration for this datastore. 

1074 

1075 Parameters 

1076 ---------- 

1077 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1078 Entities to test against this configuration. Can be differing 

1079 types. 

1080 logFailures : `bool`, optional 

1081 If `True`, output a log message for every validation error 

1082 detected. 

1083 

1084 Raises 

1085 ------ 

1086 DatastoreValidationError 

1087 Raised if there is a validation problem with a configuration. 

1088 

1089 Notes 

1090 ----- 

1091 Which parts of the configuration are validated is at the discretion 

1092 of each Datastore implementation. 

1093 """ 

1094 raise NotImplementedError("Must be implemented by subclass") 

1095 

1096 @abstractmethod 

1097 def validateKey(self, lookupKey: LookupKey, entity: Union[DatasetRef, DatasetType, StorageClass]) -> None: 

1098 """Validate a specific look up key with supplied entity. 

1099 

1100 Parameters 

1101 ---------- 

1102 lookupKey : `LookupKey` 

1103 Key to use to retrieve information from the datastore 

1104 configuration. 

1105 entity : `DatasetRef`, `DatasetType`, or `StorageClass` 

1106 Entity to compare with configuration retrieved using the 

1107 specified lookup key. 

1108 

1109 Raises 

1110 ------ 

1111 DatastoreValidationError 

1112 Raised if there is a problem with the combination of entity 

1113 and lookup key. 

1114 

1115 Notes 

1116 ----- 

1117 Bypasses the normal selection priorities by allowing a key that 

1118 would normally not be selected to be validated. 

1119 """ 

1120 raise NotImplementedError("Must be implemented by subclass") 

1121 

1122 @abstractmethod 

1123 def getLookupKeys(self) -> Set[LookupKey]: 

1124 """Return all the lookup keys relevant to this datastore. 

1125 

1126 Returns 

1127 ------- 

1128 keys : `set` of `LookupKey` 

1129 The keys stored internally for looking up information based 

1130 on `DatasetType` name or `StorageClass`. 

1131 """ 

1132 raise NotImplementedError("Must be implemented by subclass") 

1133 

1134 def needs_expanded_data_ids( 

1135 self, 

1136 transfer: Optional[str], 

1137 entity: Optional[Union[DatasetRef, DatasetType, StorageClass]] = None, 

1138 ) -> bool: 

1139 """Test whether this datastore needs expanded data IDs to ingest. 

1140 

1141 Parameters 

1142 ---------- 

1143 transfer : `str` or `None` 

1144 Transfer mode for ingest. 

1145 entity, optional 

1146 Object representing what will be ingested. If not provided (or not 

1147 specific enough), `True` may be returned even if expanded data 

1148 IDs aren't necessary. 

1149 

1150 Returns 

1151 ------- 

1152 needed : `bool` 

1153 If `True`, expanded data IDs may be needed. `False` only if 

1154 expansion definitely isn't necessary. 

1155 """ 

1156 return True 

1157 

1158 @abstractmethod 

1159 def import_records( 

1160 self, 

1161 data: Mapping[str, DatastoreRecordData], 

1162 ) -> None: 

1163 """Import datastore location and record data from an in-memory data 

1164 structure. 

1165 

1166 Parameters 

1167 ---------- 

1168 data : `Mapping` [ `str`, `DatastoreRecordData` ] 

1169 Datastore records indexed by datastore name. May contain data for 

1170 other `Datastore` instances (generally because they are chained to 

1171 this one), which should be ignored. 

1172 

1173 Notes 

1174 ----- 

1175 Implementations should generally not check that any external resources 

1176 (e.g. files) referred to by these records actually exist, for 

1177 performance reasons; we expect higher-level code to guarantee that they 

1178 do. 

1179 

1180 Implementations are responsible for calling 

1181 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations`` 

1182 where the key is in `names`, as well as loading any opaque table data. 

1183 """ 

1184 raise NotImplementedError() 

1185 

1186 @abstractmethod 

1187 def export_records( 

1188 self, 

1189 refs: Iterable[DatasetIdRef], 

1190 ) -> Mapping[str, DatastoreRecordData]: 

1191 """Export datastore records and locations to an in-memory data 

1192 structure. 

1193 

1194 Parameters 

1195 ---------- 

1196 refs : `Iterable` [ `DatasetIdRef` ] 

1197 Datasets to save. This may include datasets not known to this 

1198 datastore, which should be ignored. 

1199 

1200 Returns 

1201 ------- 

1202 data : `Mapping` [ `str`, `DatastoreRecordData` ] 

1203 Exported datastore records indexed by datastore name. 

1204 """ 

1205 raise NotImplementedError() 

1206 

1207 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

1208 """Specify a method that can be used by datastore to retrieve 

1209 registry-defined dataset type. 

1210 

1211 Parameters 

1212 ---------- 

1213 method : `~collections.abc.Callable` | `None` 

1214 Method that takes a name of the dataset type and returns a 

1215 corresponding `DatasetType` instance as defined in Registry. If 

1216 dataset type name is not known to registry `None` is returned. 

1217 

1218 Notes 

1219 ----- 

1220 This method is only needed for a Datastore supporting a "trusted" mode 

1221 when it does not have an access to datastore records and needs to 

1222 guess dataset location based on its stored dataset type. 

1223 """ 

1224 pass