Coverage for python/lsst/daf/butler/core/datastore.py: 51%

210 statements  

coverage.py v7.2.7, created at 2023-06-15 09:13 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22"""Support for generic data stores.""" 

23 

24from __future__ import annotations 

25 

26__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs") 

27 

28import contextlib 

29import dataclasses 

30import logging 

31from abc import ABCMeta, abstractmethod 

32from collections import abc, defaultdict 

33from collections.abc import Callable, Iterable, Iterator, Mapping 

34from typing import TYPE_CHECKING, Any, ClassVar 

35 

36from lsst.utils import doImportType 

37 

38from .config import Config, ConfigSubset 

39from .constraints import Constraints 

40from .exceptions import DatasetTypeNotSupportedError, ValidationError 

41from .fileDataset import FileDataset 

42from .storageClass import StorageClassFactory 

43 

44if TYPE_CHECKING: 

45 from lsst.resources import ResourcePath, ResourcePathExpression 

46 

47 from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager 

48 from .configSupport import LookupKey 

49 from .datasets import DatasetRef, DatasetType 

50 from .datastoreRecordData import DatastoreRecordData 

51 from .storageClass import StorageClass 

52 

53 

54class DatastoreConfig(ConfigSubset): 

55 """Configuration for Datastores.""" 

56 

57 component = "datastore" 

58 requiredKeys = ("cls",) 

59 defaultConfigFile = "datastore.yaml" 

60 

61 

62class DatastoreValidationError(ValidationError): 

63 """There is a problem with the Datastore configuration.""" 

64 

65 pass 

66 

67 

68@dataclasses.dataclass(frozen=True) 

69class Event: 

70 __slots__ = {"name", "undoFunc", "args", "kwargs"} 

71 name: str 

72 undoFunc: Callable 

73 args: tuple 

74 kwargs: dict 

75 

76 

77class IngestPrepData: 

78 """A helper base class for `Datastore` ingest implementations. 

79 

80 Datastore implementations will generally need a custom implementation of 

81 this class. 

82 

83 Should be accessed as ``Datastore.IngestPrepData`` instead of via direct 

84 import. 

85 

86 Parameters 

87 ---------- 

88 refs : iterable of `DatasetRef` 

89 References for the datasets that can be ingested by this datastore. 

90 """ 

91 

92 def __init__(self, refs: Iterable[DatasetRef]): 

93 self.refs = {ref.id: ref for ref in refs} 

94 

95 

96class DatastoreTransaction: 

97 """Keeps a log of `Datastore` activity and allow rollback. 

98 

99 Parameters 

100 ---------- 

101 parent : `DatastoreTransaction`, optional 

102 The parent transaction (if any).

103 """ 

104 

105 Event: ClassVar[type] = Event 

106 

107 parent: DatastoreTransaction | None 

108 """The parent transaction. (`DatastoreTransaction`, optional)""" 

109 

110 def __init__(self, parent: DatastoreTransaction | None = None): 

111 self.parent = parent 

112 self._log: list[Event] = [] 

113 

114 def registerUndo(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> None: 

115 """Register event with undo function. 

116 

117 Parameters 

118 ---------- 

119 name : `str` 

120 Name of the event. 

121 undoFunc : func 

122 Function to undo this event. 

123 args : `tuple` 

124 Positional arguments to `undoFunc`. 

125 **kwargs 

126 Keyword arguments to `undoFunc`. 

127 """ 

128 self._log.append(self.Event(name, undoFunc, args, kwargs)) 

129 

130 @contextlib.contextmanager 

131 def undoWith(self, name: str, undoFunc: Callable, *args: Any, **kwargs: Any) -> Iterator[None]: 

132 """Register undo function if nested operation succeeds. 

133 

134 Calls `registerUndo`. 

135 

136 This can be used to wrap individual undo-able statements within a 

137 DatastoreTransaction block. Multiple statements that can fail 

138 separately should not be part of the same `undoWith` block. 

139 

140 All arguments are forwarded directly to `registerUndo`. 

141 """ 

142 try: 

143 yield None 

144 except BaseException: 

145 raise 

146 else: 

147 self.registerUndo(name, undoFunc, *args, **kwargs) 

148 
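# --- Illustrative sketch, not part of datastore.py ---
# A minimal example of the undo machinery described above, using only the
# class defined in this file plus the standard library; ``write_file`` is a
# hypothetical helper standing in for a real datastore operation.
import os
import tempfile

def write_file(path: str, data: bytes) -> None:
    with open(path, "wb") as fh:
        fh.write(data)

txn = DatastoreTransaction()
path = os.path.join(tempfile.mkdtemp(), "artifact.bin")
# The undo action is registered only because the guarded statement succeeds.
with txn.undoWith("write artifact", os.remove, path):
    write_file(path, b"example payload")
txn.rollback()  # replays the log in reverse, removing the file again
# --- end sketch ---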

149 def rollback(self) -> None: 

150 """Roll back all events in this transaction.""" 

151 log = logging.getLogger(__name__) 

152 while self._log: 

153 ev = self._log.pop() 

154 try: 

155 log.debug( 

156 "Rolling back transaction: %s: %s(%s,%s)", 

157 ev.name, 

158 ev.undoFunc, 

159 ",".join(str(a) for a in ev.args), 

160 ",".join(f"{k}={v}" for k, v in ev.kwargs.items()), 

161 ) 

162 except Exception: 

163 # In case we had a problem in stringification of arguments 

164 log.warning("Rolling back transaction: %s", ev.name) 

165 try: 

166 ev.undoFunc(*ev.args, **ev.kwargs) 

167 except BaseException as e: 

168 # Deliberately swallow error that may occur in unrolling 

169 log.warning("Exception: %s caught while unrolling: %s", e, ev.name) 

170 pass 

171 

172 def commit(self) -> None: 

173 """Commit this transaction.""" 

174 if self.parent is None: 

175 # Just forget about the events, they have already happened. 

176 return 

177 else: 

178 # We may still need to be able to undo the events from this 

179 # transaction if the parent rolls back, so pass them to the parent. 

180 self.parent._log.extend(self._log) 

181 

182 

183@dataclasses.dataclass 

184class DatasetRefURIs(abc.Sequence): 

185 """Represents the primary and component ResourcePath(s) associated with a 

186 DatasetRef. 

187 

188 This is used in places where its members used to be represented as a tuple 

189 `(primaryURI, componentURIs)`. To maintain backward compatibility this 

190 inherits from Sequence and so instances can be treated as a two-item 

191 tuple. 

192 """ 

193 

194 def __init__( 

195 self, 

196 primaryURI: ResourcePath | None = None, 

197 componentURIs: dict[str, ResourcePath] | None = None, 

198 ): 

199 self.primaryURI = primaryURI 

200 """The URI to the primary artifact associated with this dataset. If the 

201 dataset was disassembled within the datastore this may be `None`. 

202 """ 

203 

204 self.componentURIs = componentURIs or {} 

205 """The URIs to any components associated with the dataset artifact 

206 indexed by component name. This can be empty if there are no 

207 components. 

208 """ 

209 

210 def __getitem__(self, index: Any) -> Any: 

211 """Get primaryURI and componentURIs by index. 

212 

213 Provides support for tuple-like access. 

214 """ 

215 if index == 0: 

216 return self.primaryURI 

217 elif index == 1: 

218 return self.componentURIs 

219 raise IndexError("list index out of range") 

220 

221 def __len__(self) -> int: 

222 """Get the number of data members. 

223 

224 Provides support for tuple-like access. 

225 """ 

226 return 2 

227 

228 def __repr__(self) -> str: 

229 return f"DatasetRefURIs({repr(self.primaryURI)}, {repr(self.componentURIs)})" 

230 
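# --- Illustrative sketch, not part of datastore.py ---
# The tuple-compatible access promised by the docstring above; the component
# URI is a made-up value.
from lsst.resources import ResourcePath

uris = DatasetRefURIs(componentURIs={"wcs": ResourcePath("file:///tmp/example.wcs.fits")})
primary, components = uris  # unpacks like the old (primaryURI, componentURIs) tuple
assert primary is None and "wcs" in components
# --- end sketch ---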

231 

232class Datastore(metaclass=ABCMeta): 

233 """Datastore interface. 

234 

235 Parameters 

236 ---------- 

237 config : `DatastoreConfig` or `str` 

238 Load configuration either from an existing config instance or by 

239 referring to a configuration file. 

240 bridgeManager : `DatastoreRegistryBridgeManager` 

241 Object that manages the interface between `Registry` and datastores. 

242 butlerRoot : `str`, optional 

243 New datastore root to use to override the configuration value. 

244 """ 

245 

246 defaultConfigFile: ClassVar[str | None] = None 

247 """Path to configuration defaults. Accessed within the ``config`` resource 

248 or relative to a search path. Can be None if no defaults specified. 

249 """ 

250 

251 containerKey: ClassVar[str | None] = None 

252 """Name of the key containing a list of subconfigurations that also 

253 need to be merged with defaults and will likely use different Python 

254 datastore classes (but all using DatastoreConfig). Assumed to be a 

255 list of configurations that can be represented in a DatastoreConfig 

256 and containing a "cls" definition. None indicates that no containers 

257 are expected in this Datastore.""" 

258 

259 isEphemeral: bool = False 

260 """Indicate whether this Datastore is ephemeral or not. An ephemeral 

261 datastore is one where the contents of the datastore will not exist 

262 across process restarts. This value can change per-instance.""" 

263 

264 config: DatastoreConfig 

265 """Configuration used to create Datastore.""" 

266 

267 name: str 

268 """Label associated with this Datastore.""" 

269 

270 storageClassFactory: StorageClassFactory 

271 """Factory for creating storage class instances from name.""" 

272 

273 constraints: Constraints 

274 """Constraints to apply when putting datasets into the datastore.""" 

275 

276 # MyPy does not like for this to be annotated as any kind of type, because 

277 # it can't do static checking on type variables that can change at runtime. 

278 IngestPrepData: ClassVar[Any] = IngestPrepData 

279 """Helper base class for ingest implementations. 

280 """ 

281 

282 @classmethod 

283 @abstractmethod 

284 def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: 

285 """Set filesystem-dependent config options for this datastore. 

286 

287 The options will be appropriate for a new empty repository with the 

288 given root. 

289 

290 Parameters 

291 ---------- 

292 root : `str` 

293 Filesystem path to the root of the data repository. 

294 config : `Config` 

295 A `Config` to update. Only the subset understood by 

296 this component will be updated. Will not expand 

297 defaults. 

298 full : `Config` 

299 A complete config with all defaults expanded that can be 

300 converted to a `DatastoreConfig`. Read-only and will not be 

301 modified by this method. 

302 Repository-specific options that should not be obtained 

303 from defaults when Butler instances are constructed 

304 should be copied from ``full`` to ``config``. 

305 overwrite : `bool`, optional 

306 If `False`, do not modify a value in ``config`` if the value 

307 already exists. Default is always to overwrite with the provided 

308 ``root``. 

309 

310 Notes 

311 ----- 

312 If a keyword is explicitly defined in the supplied ``config`` it 

313 will not be overridden by this method if ``overwrite`` is `False`. 

314 This allows explicit values set in external configs to be retained. 

315 """ 

316 raise NotImplementedError() 

317 

318 @staticmethod 

319 def fromConfig( 

320 config: Config, 

321 bridgeManager: DatastoreRegistryBridgeManager, 

322 butlerRoot: ResourcePathExpression | None = None, 

323 ) -> Datastore: 

324 """Create datastore from type specified in config file. 

325 

326 Parameters 

327 ---------- 

328 config : `Config` or `~lsst.resources.ResourcePathExpression` 

329 Configuration instance. 

330 bridgeManager : `DatastoreRegistryBridgeManager` 

331 Object that manages the interface between `Registry` and 

332 datastores. 

333 butlerRoot : `str`, optional 

334 Butler root directory. 

335 """ 

336 cls = doImportType(config["datastore", "cls"]) 

337 if not issubclass(cls, Datastore): 

338 raise TypeError(f"Imported child class {config['datastore', 'cls']} is not a Datastore") 

339 return cls(config=config, bridgeManager=bridgeManager, butlerRoot=butlerRoot) 

340 

341 def __init__( 

342 self, 

343 config: Config | ResourcePathExpression, 

344 bridgeManager: DatastoreRegistryBridgeManager, 

345 butlerRoot: ResourcePathExpression | None = None, 

346 ): 

347 self.config = DatastoreConfig(config) 

348 self.name = "ABCDataStore" 

349 self._transaction: DatastoreTransaction | None = None 

350 

351 # All Datastores need storage classes and constraints 

352 self.storageClassFactory = StorageClassFactory() 

353 

354 # And read the constraints list 

355 constraintsConfig = self.config.get("constraints") 

356 self.constraints = Constraints(constraintsConfig, universe=bridgeManager.universe) 

357 

358 def __str__(self) -> str: 

359 return self.name 

360 

361 def __repr__(self) -> str: 

362 return self.name 

363 

364 @property 

365 def names(self) -> tuple[str, ...]: 

366 """Names associated with this datastore returned as a list. 

367 

368 Can be different to ``name`` for a chaining datastore. 

369 """ 

370 # Default implementation returns solely the name itself 

371 return (self.name,) 

372 

373 @contextlib.contextmanager 

374 def transaction(self) -> Iterator[DatastoreTransaction]: 

375 """Context manager supporting `Datastore` transactions. 

376 

377 Transactions can be nested, and are to be used in combination with 

378 `Registry.transaction`. 

379 """ 

380 self._transaction = DatastoreTransaction(self._transaction) 

381 try: 

382 yield self._transaction 

383 except BaseException: 

384 self._transaction.rollback() 

385 raise 

386 else: 

387 self._transaction.commit() 

388 self._transaction = self._transaction.parent 

389 
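# --- Illustrative sketch, not part of datastore.py ---
# Typical use of the transaction() context manager above; ``datastore``,
# ``dataset`` and ``ref`` are assumed to exist already.
try:
    with datastore.transaction():
        datastore.put(dataset, ref)
        raise RuntimeError("simulated failure after the put")
except RuntimeError:
    pass  # the nested DatastoreTransaction rolled back its undo log before re-raising
# --- end sketch ---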

390 @abstractmethod 

391 def knows(self, ref: DatasetRef) -> bool: 

392 """Check if the dataset is known to the datastore. 

393 

394 Does not check for existence of any artifact. 

395 

396 Parameters 

397 ---------- 

398 ref : `DatasetRef` 

399 Reference to the required dataset. 

400 

401 Returns 

402 ------- 

403 exists : `bool` 

404 `True` if the dataset is known to the datastore. 

405 """ 

406 raise NotImplementedError() 

407 

408 def knows_these(self, refs: Iterable[DatasetRef]) -> dict[DatasetRef, bool]: 

409 """Check which of the given datasets are known to this datastore. 

410 

411 This is like ``mexists()`` but does not check that the files exist. 

412 

413 Parameters 

414 ---------- 

415 refs : iterable of `DatasetRef` 

416 The datasets to check. 

417 

418 Returns 

419 ------- 

420 exists : `dict`[`DatasetRef`, `bool`] 

421 Mapping of dataset to boolean indicating whether the dataset 

422 is known to the datastore. 

423 """ 

424 # Non-optimized default calls knows() repeatedly. 

425 return {ref: self.knows(ref) for ref in refs} 

426 

427 def mexists( 

428 self, refs: Iterable[DatasetRef], artifact_existence: dict[ResourcePath, bool] | None = None 

429 ) -> dict[DatasetRef, bool]: 

430 """Check the existence of multiple datasets at once. 

431 

432 Parameters 

433 ---------- 

434 refs : iterable of `DatasetRef` 

435 The datasets to be checked. 

436 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

437 Optional mapping of datastore artifact to existence. Updated by 

438 this method with details of all artifacts tested. Can be `None` 

439 if the caller is not interested. 

440 

441 Returns 

442 ------- 

443 existence : `dict` [`DatasetRef`, `bool`] 

444 Mapping from dataset to boolean indicating existence. 

445 """ 

446 existence: dict[DatasetRef, bool] = {} 

447 # Non-optimized default. 

448 for ref in refs: 

449 existence[ref] = self.exists(ref) 

450 return existence 

451 
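# --- Illustrative sketch, not part of datastore.py ---
# Batch checks using the two methods above; ``datastore`` and ``refs`` are
# assumed to exist. knows_these() consults only the datastore's own records,
# while mexists() also checks the artifacts themselves.
known = datastore.knows_these(refs)
present = datastore.mexists(refs)
missing_artifacts = [ref for ref, ok in present.items() if known[ref] and not ok]
# --- end sketch ---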

452 @abstractmethod 

453 def exists(self, datasetRef: DatasetRef) -> bool: 

454 """Check if the dataset exists in the datastore. 

455 

456 Parameters 

457 ---------- 

458 datasetRef : `DatasetRef` 

459 Reference to the required dataset. 

460 

461 Returns 

462 ------- 

463 exists : `bool` 

464 `True` if the entity exists in the `Datastore`. 

465 """ 

466 raise NotImplementedError("Must be implemented by subclass") 

467 

468 @abstractmethod 

469 def get( 

470 self, 

471 datasetRef: DatasetRef, 

472 parameters: Mapping[str, Any] | None = None, 

473 storageClass: StorageClass | str | None = None, 

474 ) -> Any: 

475 """Load an `InMemoryDataset` from the store. 

476 

477 Parameters 

478 ---------- 

479 datasetRef : `DatasetRef` 

480 Reference to the required Dataset. 

481 parameters : `dict` 

482 `StorageClass`-specific parameters that specify a slice of the 

483 Dataset to be loaded. 

484 storageClass : `StorageClass` or `str`, optional 

485 The storage class to be used to override the Python type 

486 returned by this method. By default the returned type matches 

487 the dataset type definition for this dataset. Specifying a 

488 read `StorageClass` can force a different type to be returned. 

489 This type must be compatible with the original type. 

490 

491 Returns 

492 ------- 

493 inMemoryDataset : `object` 

494 Requested Dataset or slice thereof as an InMemoryDataset. 

495 """ 

496 raise NotImplementedError("Must be implemented by subclass") 

497 

498 @abstractmethod 

499 def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None: 

500 """Write a `InMemoryDataset` with a given `DatasetRef` to the store. 

501 

502 Parameters 

503 ---------- 

504 inMemoryDataset : `object` 

505 The Dataset to store. 

506 datasetRef : `DatasetRef` 

507 Reference to the associated Dataset. 

508 """ 

509 raise NotImplementedError("Must be implemented by subclass") 

510 

511 def _overrideTransferMode(self, *datasets: FileDataset, transfer: str | None = None) -> str | None: 

512 """Allow ingest transfer mode to be defaulted based on datasets. 

513 

514 Parameters 

515 ---------- 

516 datasets : `FileDataset` 

517 Each positional argument is a struct containing information about 

518 a file to be ingested, including its path (either absolute or 

519 relative to the datastore root, if applicable), a complete 

520 `DatasetRef` (with ``dataset_id not None``), and optionally a 

521 formatter class or its fully-qualified string name. If a formatter 

522 is not provided, this method should populate that attribute with 

523 the formatter the datastore would use for `put`. Subclasses are 

524 also permitted to modify the path attribute (typically to put it 

525 in what the datastore considers its standard form). 

526 transfer : `str`, optional 

527 How (and whether) the dataset should be added to the datastore. 

528 See `ingest` for details of transfer modes. 

529 

530 Returns 

531 ------- 

532 newTransfer : `str` 

533 Transfer mode to use. Will be identical to the supplied transfer 

534 mode unless "auto" is used. 

535 """ 

536 if transfer != "auto": 

537 return transfer 

538 raise RuntimeError(f"{transfer} is not allowed without specialization.") 

539 

540 def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData: 

541 """Process datasets to identify which ones can be ingested. 

542 

543 Parameters 

544 ---------- 

545 datasets : `FileDataset` 

546 Each positional argument is a struct containing information about 

547 a file to be ingested, including its path (either absolute or 

548 relative to the datastore root, if applicable), a complete 

549 `DatasetRef` (with ``dataset_id not None``), and optionally a 

550 formatter class or its fully-qualified string name. If a formatter 

551 is not provided, this method should populate that attribute with 

552 the formatter the datastore would use for `put`. Subclasses are 

553 also permitted to modify the path attribute (typically to put it 

554 in what the datastore considers its standard form). 

555 transfer : `str`, optional 

556 How (and whether) the dataset should be added to the datastore. 

557 See `ingest` for details of transfer modes. 

558 

559 Returns 

560 ------- 

561 data : `IngestPrepData` 

562 An instance of a subclass of `IngestPrepData`, used to pass 

563 arbitrary data from `_prepIngest` to `_finishIngest`. This should 

564 include only the datasets this datastore can actually ingest; 

565 others should be silently ignored (`Datastore.ingest` will inspect 

566 `IngestPrepData.refs` and raise `DatasetTypeNotSupportedError` if 

567 necessary). 

568 

569 Raises 

570 ------ 

571 NotImplementedError 

572 Raised if the datastore does not support the given transfer mode 

573 (including the case where ingest is not supported at all). 

574 FileNotFoundError 

575 Raised if one of the given files does not exist. 

576 FileExistsError 

577 Raised if transfer is not `None` but the (internal) location the 

578 file would be moved to is already occupied. 

579 

580 Notes 

581 ----- 

582 This method (along with `_finishIngest`) should be implemented by 

583 subclasses to provide ingest support instead of implementing `ingest` 

584 directly. 

585 

586 `_prepIngest` should not modify the data repository or given files in 

587 any way; all changes should be deferred to `_finishIngest`. 

588 

589 When possible, exceptions should be raised in `_prepIngest` instead of 

590 `_finishIngest`. `NotImplementedError` exceptions that indicate that 

591 the transfer mode is not supported must be raised by `_prepIngest` 

592 instead of `_finishIngest`. 

593 """ 

594 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

595 

596 def _finishIngest( 

597 self, prepData: IngestPrepData, *, transfer: str | None = None, record_validation_info: bool = True 

598 ) -> None: 

599 """Complete an ingest operation. 

600 

601 Parameters 

602 ---------- 

603 prepData : `IngestPrepData` 

604 An instance of a subclass of `IngestPrepData`. Guaranteed to be 

605 the direct result of a call to `_prepIngest` on this datastore. 

606 transfer : `str`, optional 

607 How (and whether) the dataset should be added to the datastore. 

608 See `ingest` for details of transfer modes. 

609 record_validation_info : `bool`, optional 

610 If `True`, the default, the datastore can record validation 

611 information associated with the file. If `False` the datastore 

612 will not attempt to track any information such as checksums 

613 or file sizes. This can be useful if such information is tracked 

614 in an external system or if the file is to be compressed in place. 

615 It is up to the datastore whether this parameter is relevant. 

616 

617 Raises 

618 ------ 

619 FileNotFoundError 

620 Raised if one of the given files does not exist. 

621 FileExistsError 

622 Raised if transfer is not `None` but the (internal) location the 

623 file would be moved to is already occupied. 

624 

625 Notes 

626 ----- 

627 This method (along with `_prepIngest`) should be implemented by 

628 subclasses to provide ingest support instead of implementing `ingest` 

629 directly. 

630 """ 

631 raise NotImplementedError(f"Datastore {self} does not support direct file-based ingest.") 

632 
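# --- Illustrative sketch, not part of datastore.py ---
# One way a subclass might split ingest work as the notes above suggest:
# filter and validate in _prepIngest, touch storage only in _finishIngest.
# This assumes Constraints provides an isAcceptable() check; the _finishIngest
# body is elided and the other abstract methods are omitted.
class SketchDatastore(Datastore):
    def _prepIngest(self, *datasets: FileDataset, transfer: str | None = None) -> IngestPrepData:
        acceptable = [ref for dataset in datasets for ref in dataset.refs
                      if self.constraints.isAcceptable(ref)]
        return self.IngestPrepData(acceptable)

    def _finishIngest(self, prepData: IngestPrepData, *, transfer: str | None = None,
                      record_validation_info: bool = True) -> None:
        for ref in prepData.refs.values():
            ...  # copy/move/link the artifact and record its location
# --- end sketch ---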

633 def ingest( 

634 self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True 

635 ) -> None: 

636 """Ingest one or more files into the datastore. 

637 

638 Parameters 

639 ---------- 

640 datasets : `FileDataset` 

641 Each positional argument is a struct containing information about 

642 a file to be ingested, including its path (either absolute or 

643 relative to the datastore root, if applicable), a complete 

644 `DatasetRef` (with ``dataset_id not None``), and optionally a 

645 formatter class or its fully-qualified string name. If a formatter 

646 is not provided, the one the datastore would use for ``put`` on 

647 that dataset is assumed. 

648 transfer : `str`, optional 

649 How (and whether) the dataset should be added to the datastore. 

650 If `None` (default), the file must already be in a location 

651 appropriate for the datastore (e.g. within its root directory), 

652 and will not be modified. Other choices include "move", "copy", 

653 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

654 special transfer mode that will first try to make a hardlink and 

655 if that fails a symlink will be used instead. "relsymlink" creates 

656 a relative symlink rather than use an absolute path. 

657 Most datastores do not support all transfer modes. 

658 "auto" is a special option that will let the 

659 data store choose the most natural option for itself. 

660 record_validation_info : `bool`, optional 

661 If `True`, the default, the datastore can record validation 

662 information associated with the file. If `False` the datastore 

663 will not attempt to track any information such as checksums 

664 or file sizes. This can be useful if such information is tracked 

665 in an external system or if the file is to be compressed in place. 

666 It is up to the datastore whether this parameter is relevant. 

667 

668 Raises 

669 ------ 

670 NotImplementedError 

671 Raised if the datastore does not support the given transfer mode 

672 (including the case where ingest is not supported at all). 

673 DatasetTypeNotSupportedError 

674 Raised if one or more files to be ingested have a dataset type that 

675 is not supported by the datastore. 

676 FileNotFoundError 

677 Raised if one of the given files does not exist. 

678 FileExistsError 

679 Raised if transfer is not `None` but the (internal) location the 

680 file would be moved to is already occupied. 

681 

682 Notes 

683 ----- 

684 Subclasses should implement `_prepIngest` and `_finishIngest` instead 

685 of implementing `ingest` directly. Datastores that hold and 

686 delegate to child datastores may want to call those methods as well. 

687 

688 Subclasses are encouraged to document their supported transfer modes 

689 in their class documentation. 

690 """ 

691 # Allow a datastore to select a default transfer mode 

692 transfer = self._overrideTransferMode(*datasets, transfer=transfer) 

693 prepData = self._prepIngest(*datasets, transfer=transfer) 

694 refs = {ref.id: ref for dataset in datasets for ref in dataset.refs} 

695 if refs.keys() != prepData.refs.keys(): 

696 unsupported = refs.keys() - prepData.refs.keys() 

697 # Group unsupported refs by DatasetType for an informative 

698 # but still concise error message. 

699 byDatasetType = defaultdict(list) 

700 for datasetId in unsupported: 

701 ref = refs[datasetId] 

702 byDatasetType[ref.datasetType].append(ref) 

703 raise DatasetTypeNotSupportedError( 

704 "DatasetType(s) not supported in ingest: " 

705 + ", ".join(f"{k.name} ({len(v)} dataset(s))" for k, v in byDatasetType.items()) 

706 ) 

707 self._finishIngest(prepData, transfer=transfer, record_validation_info=record_validation_info) 

708 
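# --- Illustrative sketch, not part of datastore.py ---
# Calling ingest() on a concrete datastore; the file path and ``ref`` are
# assumptions, and the datastore must implement _prepIngest/_finishIngest.
dataset = FileDataset(path="external/data/calexp.fits", refs=[ref])
datastore.ingest(dataset, transfer="copy", record_validation_info=False)
# --- end sketch ---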

709 def transfer_from( 

710 self, 

711 source_datastore: Datastore, 

712 refs: Iterable[DatasetRef], 

713 transfer: str = "auto", 

714 artifact_existence: dict[ResourcePath, bool] | None = None, 

715 ) -> tuple[set[DatasetRef], set[DatasetRef]]: 

716 """Transfer dataset artifacts from another datastore to this one. 

717 

718 Parameters 

719 ---------- 

720 source_datastore : `Datastore` 

721 The datastore from which to transfer artifacts. That datastore 

722 must be compatible with this datastore receiving the artifacts. 

723 refs : iterable of `DatasetRef` 

724 The datasets to transfer from the source datastore. 

725 transfer : `str`, optional 

726 How (and whether) the dataset should be added to the datastore. 

727 Choices include "move", "copy", 

728 "link", "symlink", "relsymlink", and "hardlink". "link" is a 

729 special transfer mode that will first try to make a hardlink and 

730 if that fails a symlink will be used instead. "relsymlink" creates 

731 a relative symlink rather than use an absolute path. 

732 Most datastores do not support all transfer modes. 

733 "auto" (the default) is a special option that will let the 

734 data store choose the most natural option for itself. 

735 If the source location and transfer location are identical the 

736 transfer mode will be ignored. 

737 artifact_existence : `dict` [`lsst.resources.ResourcePath`, `bool`] 

738 Optional mapping of datastore artifact to existence. Updated by 

739 this method with details of all artifacts tested. Can be `None` 

740 if the caller is not interested. 

741 

742 Returns 

743 ------- 

744 accepted : `set` [`DatasetRef`] 

745 The datasets that were transferred. 

746 rejected : `set` [`DatasetRef`] 

747 The datasets that were rejected due to a constraints violation. 

748 

749 Raises 

750 ------ 

751 TypeError 

752 Raised if the two datastores are not compatible. 

753 """ 

754 if type(self) is not type(source_datastore): 

755 raise TypeError( 

756 f"Datastore mismatch between this datastore ({type(self)}) and the " 

757 f"source datastore ({type(source_datastore)})." 

758 ) 

759 

760 raise NotImplementedError(f"Datastore {type(self)} must implement a transfer_from method.") 

761 

762 def getManyURIs( 

763 self, 

764 refs: Iterable[DatasetRef], 

765 predict: bool = False, 

766 allow_missing: bool = False, 

767 ) -> dict[DatasetRef, DatasetRefURIs]: 

768 """Return URIs associated with many datasets. 

769 

770 Parameters 

771 ---------- 

772 refs : iterable of `DatasetIdRef` 

773 References to the required datasets. 

774 predict : `bool`, optional 

775 If the datastore does not know about a dataset, should it 

776 return a predicted URI or not? 

777 allow_missing : `bool` 

778 If `False`, and `predict` is `False`, will raise if a `DatasetRef` 

779 does not exist. 

780 

781 Returns 

782 ------- 

783 URIs : `dict` [`DatasetRef`, `DatasetRefURIs`] 

784 A dict of primary and component URIs, indexed by the passed-in 

785 refs. 

786 

787 Raises 

788 ------ 

789 FileNotFoundError 

790 A URI has been requested for a dataset that does not exist and 

791 guessing is not allowed. 

792 

793 Notes 

794 ----- 

795 In file-based datastores, `getManyURIs` does not check that the files 

796 really exist; it assumes that if the datastore knows about a file then 

797 that file exists. 

798 """ 

799 uris: dict[DatasetRef, DatasetRefURIs] = {} 

800 missing_refs = [] 

801 for ref in refs: 

802 try: 

803 uris[ref] = self.getURIs(ref, predict=predict) 

804 except FileNotFoundError: 

805 missing_refs.append(ref) 

806 if missing_refs and not allow_missing: 

807 raise FileNotFoundError( 

808 "Missing {} refs from datastore out of {} and predict=False.".format( 

809 num_missing := len(missing_refs), num_missing + len(uris) 

810 ) 

811 ) 

812 return uris 

813 
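# --- Illustrative sketch, not part of datastore.py ---
# Looking up URIs for many refs while tolerating refs the datastore does not
# know about; ``datastore`` and ``refs`` are assumed.
uris = datastore.getManyURIs(refs, predict=False, allow_missing=True)
for ref, dataset_uris in uris.items():
    print(ref, dataset_uris.primaryURI, sorted(dataset_uris.componentURIs))
# --- end sketch ---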

814 @abstractmethod 

815 def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs: 

816 """Return URIs associated with dataset. 

817 

818 Parameters 

819 ---------- 

820 datasetRef : `DatasetRef` 

821 Reference to the required dataset. 

822 predict : `bool`, optional 

823 If the datastore does not know about the dataset, should it 

824 return a predicted URI or not? 

825 

826 Returns 

827 ------- 

828 uris : `DatasetRefURIs` 

829 The URI to the primary artifact associated with this dataset (if 

830 the dataset was disassembled within the datastore this may be 

831 `None`), and the URIs to any components associated with the dataset 

832 artifact (this can be empty if there are no components). 

833 """ 

834 raise NotImplementedError() 

835 

836 @abstractmethod 

837 def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath: 

838 """URI to the Dataset. 

839 

840 Parameters 

841 ---------- 

842 datasetRef : `DatasetRef` 

843 Reference to the required Dataset. 

844 predict : `bool` 

845 If `True` attempt to predict the URI for a dataset if it does 

846 not exist in datastore. 

847 

848 Returns 

849 ------- 

850 uri : `lsst.resources.ResourcePath` 

851 URI pointing to the Dataset within the datastore. If the 

852 Dataset does not exist in the datastore, the URI may be a guess. 

853 If the datastore does not have entities that relate well 

854 to the concept of a URI the returned URI string will be 

855 descriptive. The returned URI is not guaranteed to be obtainable. 

856 

857 Raises 

858 ------ 

859 FileNotFoundError 

860 A URI has been requested for a dataset that does not exist and 

861 guessing is not allowed. 

862 """ 

863 raise NotImplementedError("Must be implemented by subclass") 

864 

865 @abstractmethod 

866 def retrieveArtifacts( 

867 self, 

868 refs: Iterable[DatasetRef], 

869 destination: ResourcePath, 

870 transfer: str = "auto", 

871 preserve_path: bool = True, 

872 overwrite: bool = False, 

873 ) -> list[ResourcePath]: 

874 """Retrieve the artifacts associated with the supplied refs. 

875 

876 Parameters 

877 ---------- 

878 refs : iterable of `DatasetRef` 

879 The datasets for which artifacts are to be retrieved. 

880 A single ref can result in multiple artifacts. The refs must 

881 be resolved. 

882 destination : `lsst.resources.ResourcePath` 

883 Location to write the artifacts. 

884 transfer : `str`, optional 

885 Method to use to transfer the artifacts. Must be one of the options 

886 supported by `lsst.resources.ResourcePath.transfer_from()`. 

887 "move" is not allowed. 

888 preserve_path : `bool`, optional 

889 If `True` the full path of the artifact within the datastore 

890 is preserved. If `False` the final file component of the path 

891 is used. 

892 overwrite : `bool`, optional 

893 If `True` allow transfers to overwrite existing files at the 

894 destination. 

895 

896 Returns 

897 ------- 

898 targets : `list` of `lsst.resources.ResourcePath` 

899 URIs of file artifacts in destination location. Order is not 

900 preserved. 

901 

902 Notes 

903 ----- 

904 For non-file datastores the artifacts written to the destination 

905 may not match the representation inside the datastore. For example 

906 a hierarchical data structure in a NoSQL database may well be stored 

907 as a JSON file. 

908 """ 

909 raise NotImplementedError() 

910 
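# --- Illustrative sketch, not part of datastore.py ---
# Copying the artifacts behind some refs into a local directory; ``datastore``
# and ``refs`` are assumed, and forceDirectory marks the target as a directory.
from lsst.resources import ResourcePath

destination = ResourcePath("retrieved_artifacts/", forceDirectory=True)
targets = datastore.retrieveArtifacts(refs, destination, transfer="copy", preserve_path=True)
# --- end sketch ---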

911 @abstractmethod 

912 def remove(self, datasetRef: DatasetRef) -> None: 

913 """Indicate to the Datastore that a Dataset can be removed. 

914 

915 Parameters 

916 ---------- 

917 datasetRef : `DatasetRef` 

918 Reference to the required Dataset. 

919 

920 Raises 

921 ------ 

922 FileNotFoundError 

923 When Dataset does not exist. 

924 

925 Notes 

926 ----- 

927 Some Datastores may implement this method as a silent no-op to 

928 disable Dataset deletion through standard interfaces. 

929 """ 

930 raise NotImplementedError("Must be implemented by subclass") 

931 

932 @abstractmethod 

933 def forget(self, refs: Iterable[DatasetRef]) -> None: 

934 """Indicate to the Datastore that it should remove all records of the 

935 given datasets, without actually deleting them. 

936 

937 Parameters 

938 ---------- 

939 refs : `~collections.abc.Iterable` [ `DatasetRef` ] 

940 References to the datasets being forgotten. 

941 

942 Notes 

943 ----- 

944 Asking a datastore to forget a `DatasetRef` it does not hold should be 

945 a silent no-op, not an error. 

946 """ 

947 raise NotImplementedError("Must be implemented by subclass") 

948 

949 @abstractmethod 

950 def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None: 

951 """Indicate to the Datastore that a Dataset can be moved to the trash. 

952 

953 Parameters 

954 ---------- 

955 ref : `DatasetRef` or iterable thereof 

956 Reference(s) to the required Dataset. 

957 ignore_errors : `bool`, optional 

958 Determine whether errors should be ignored. When multiple 

959 refs are being trashed there will be no per-ref check. 

960 

961 Raises 

962 ------ 

963 FileNotFoundError 

964 When Dataset does not exist and errors are not ignored. Only 

965 checked if a single ref is supplied (and not in a list). 

966 

967 Notes 

968 ----- 

969 Some Datastores may implement this method as a silent no-op to 

970 disable Dataset deletion through standard interfaces. 

971 """ 

972 raise NotImplementedError("Must be implemented by subclass") 

973 

974 @abstractmethod 

975 def emptyTrash(self, ignore_errors: bool = True) -> None: 

976 """Remove all datasets from the trash. 

977 

978 Parameters 

979 ---------- 

980 ignore_errors : `bool`, optional 

981 Determine whether errors should be ignored. 

982 

983 Notes 

984 ----- 

985 Some Datastores may implement this method as a silent no-op to 

986 disable Dataset deletion through standard interfaces. 

987 """ 

988 raise NotImplementedError("Must be implemented by subclass") 

989 

990 @abstractmethod 

991 def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None: 

992 """Transfer a dataset from another datastore to this datastore. 

993 

994 Parameters 

995 ---------- 

996 inputDatastore : `Datastore` 

997 The external `Datastore` from which to retrieve the Dataset. 

998 datasetRef : `DatasetRef` 

999 Reference to the required Dataset. 

1000 """ 

1001 raise NotImplementedError("Must be implemented by subclass") 

1002 

1003 def export( 

1004 self, 

1005 refs: Iterable[DatasetRef], 

1006 *, 

1007 directory: ResourcePathExpression | None = None, 

1008 transfer: str | None = "auto", 

1009 ) -> Iterable[FileDataset]: 

1010 """Export datasets for transfer to another data repository. 

1011 

1012 Parameters 

1013 ---------- 

1014 refs : iterable of `DatasetRef` 

1015 Dataset references to be exported. 

1016 directory : `str`, optional 

1017 Path to a directory that should contain files corresponding to 

1018 output datasets. Ignored if ``transfer`` is explicitly `None`. 

1019 transfer : `str`, optional 

1020 Mode that should be used to move datasets out of the repository. 

1021 Valid options are the same as those of the ``transfer`` argument 

1022 to ``ingest``, and datastores may similarly signal that a transfer 

1023 mode is not supported by raising `NotImplementedError`. If "auto" 

1024 is given and no ``directory`` is specified, `None` will be 

1025 implied. 

1026 

1027 Returns 

1028 ------- 

1029 datasets : iterable of `FileDataset` 

1030 Structs containing information about the exported datasets, in the 

1031 same order as ``refs``. 

1032 

1033 Raises 

1034 ------ 

1035 NotImplementedError 

1036 Raised if the given transfer mode is not supported. 

1037 """ 

1038 raise NotImplementedError(f"Transfer mode {transfer} not supported.") 

1039 

1040 @abstractmethod 

1041 def validateConfiguration( 

1042 self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False 

1043 ) -> None: 

1044 """Validate some of the configuration for this datastore. 

1045 

1046 Parameters 

1047 ---------- 

1048 entities : iterable of `DatasetRef`, `DatasetType`, or `StorageClass` 

1049 Entities to test against this configuration. Can be differing 

1050 types. 

1051 logFailures : `bool`, optional 

1052 If `True`, output a log message for every validation error 

1053 detected. 

1054 

1055 Raises 

1056 ------ 

1057 DatastoreValidationError 

1058 Raised if there is a validation problem with a configuration. 

1059 

1060 Notes 

1061 ----- 

1062 Which parts of the configuration are validated is at the discretion 

1063 of each Datastore implementation. 

1064 """ 

1065 raise NotImplementedError("Must be implemented by subclass") 

1066 

1067 @abstractmethod 

1068 def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None: 

1069 """Validate a specific look up key with supplied entity. 

1070 

1071 Parameters 

1072 ---------- 

1073 lookupKey : `LookupKey` 

1074 Key to use to retrieve information from the datastore 

1075 configuration. 

1076 entity : `DatasetRef`, `DatasetType`, or `StorageClass` 

1077 Entity to compare with configuration retrieved using the 

1078 specified lookup key. 

1079 

1080 Raises 

1081 ------ 

1082 DatastoreValidationError 

1083 Raised if there is a problem with the combination of entity 

1084 and lookup key. 

1085 

1086 Notes 

1087 ----- 

1088 Bypasses the normal selection priorities by allowing a key that 

1089 would normally not be selected to be validated. 

1090 """ 

1091 raise NotImplementedError("Must be implemented by subclass") 

1092 

1093 @abstractmethod 

1094 def getLookupKeys(self) -> set[LookupKey]: 

1095 """Return all the lookup keys relevant to this datastore. 

1096 

1097 Returns 

1098 ------- 

1099 keys : `set` of `LookupKey` 

1100 The keys stored internally for looking up information based 

1101 on `DatasetType` name or `StorageClass`. 

1102 """ 

1103 raise NotImplementedError("Must be implemented by subclass") 

1104 

1105 def needs_expanded_data_ids( 

1106 self, 

1107 transfer: str | None, 

1108 entity: DatasetRef | DatasetType | StorageClass | None = None, 

1109 ) -> bool: 

1110 """Test whether this datastore needs expanded data IDs to ingest. 

1111 

1112 Parameters 

1113 ---------- 

1114 transfer : `str` or `None` 

1115 Transfer mode for ingest. 

1116 entity : `DatasetRef`, `DatasetType`, or `StorageClass`, optional 

1117 Object representing what will be ingested. If not provided (or not 

1118 specific enough), `True` may be returned even if expanded data 

1119 IDs aren't necessary. 

1120 

1121 Returns 

1122 ------- 

1123 needed : `bool` 

1124 If `True`, expanded data IDs may be needed. `False` only if 

1125 expansion definitely isn't necessary. 

1126 """ 

1127 return True 

1128 

1129 @abstractmethod 

1130 def import_records( 

1131 self, 

1132 data: Mapping[str, DatastoreRecordData], 

1133 ) -> None: 

1134 """Import datastore location and record data from an in-memory data 

1135 structure. 

1136 

1137 Parameters 

1138 ---------- 

1139 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ] 

1140 Datastore records indexed by datastore name. May contain data for 

1141 other `Datastore` instances (generally because they are chained to 

1142 this one), which should be ignored. 

1143 

1144 Notes 

1145 ----- 

1146 Implementations should generally not check that any external resources 

1147 (e.g. files) referred to by these records actually exist, for 

1148 performance reasons; we expect higher-level code to guarantee that they 

1149 do. 

1150 

1151 Implementations are responsible for calling 

1152 `DatastoreRegistryBridge.insert` on all datasets in ``data.locations`` 

1153 where the key is in `names`, as well as loading any opaque table data. 

1154 """ 

1155 raise NotImplementedError() 

1156 

1157 @abstractmethod 

1158 def export_records( 

1159 self, 

1160 refs: Iterable[DatasetIdRef], 

1161 ) -> Mapping[str, DatastoreRecordData]: 

1162 """Export datastore records and locations to an in-memory data 

1163 structure. 

1164 

1165 Parameters 

1166 ---------- 

1167 refs : `~collections.abc.Iterable` [ `DatasetIdRef` ] 

1168 Datasets to save. This may include datasets not known to this 

1169 datastore, which should be ignored. 

1170 

1171 Returns 

1172 ------- 

1173 data : `~collections.abc.Mapping` [ `str`, `DatastoreRecordData` ] 

1174 Exported datastore records indexed by datastore name. 

1175 """ 

1176 raise NotImplementedError() 

1177 
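# --- Illustrative sketch, not part of datastore.py ---
# Round-tripping datastore records between two compatible datastores using the
# two methods above; both datastore instances and ``refs`` are assumptions.
records = source_datastore.export_records(refs)
target_datastore.import_records(records)
# --- end sketch ---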

1178 def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: 

1179 """Specify a method that can be used by datastore to retrieve 

1180 registry-defined dataset type. 

1181 

1182 Parameters 

1183 ---------- 

1184 method : `~collections.abc.Callable` | `None` 

1185 Method that takes a name of the dataset type and returns a 

1186 corresponding `DatasetType` instance as defined in Registry. If 

1187 dataset type name is not known to registry `None` is returned. 

1188 

1189 Notes 

1190 ----- 

1191 This method is only needed for a Datastore supporting a "trusted" mode 

1192 when it does not have access to datastore records and needs to 

1193 guess dataset location based on its stored dataset type. 

1194 """ 

1195 pass
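# --- Illustrative sketch, not part of datastore.py ---
# Supplying the lookup hook described above. ``registry`` is assumed to be a
# Butler Registry; the hypothetical wrapper converts its "unknown dataset type"
# error into the None return this hook expects.
def _lookup_dataset_type(name: str) -> DatasetType | None:
    try:
        return registry.getDatasetType(name)
    except KeyError:
        return None

datastore.set_retrieve_dataset_type_method(_lookup_dataset_type)
# --- end sketch ---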