Coverage for python/lsst/daf/butler/_butler.py: 9%


623 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_class_of 

62from lsst.utils.logging import VERBOSE, getLogger 

63 

64from ._butlerConfig import ButlerConfig 

65from ._butlerRepoIndex import ButlerRepoIndex 

66from ._deferredDatasetHandle import DeferredDatasetHandle 

67from .core import ( 

68 AmbiguousDatasetError, 

69 Config, 

70 ConfigSubset, 

71 DataCoordinate, 

72 DataId, 

73 DataIdValue, 

74 DatasetRef, 

75 DatasetType, 

76 Datastore, 

77 Dimension, 

78 DimensionConfig, 

79 FileDataset, 

80 Progress, 

81 StorageClassFactory, 

82 Timespan, 

83 ValidationError, 

84) 

85from .core.repoRelocation import BUTLER_ROOT_TAG 

86from .core.utils import transactional 

87from .registry import ( 

88 CollectionSearch, 

89 CollectionType, 

90 ConflictingDefinitionError, 

91 DatasetIdGenEnum, 

92 Registry, 

93 RegistryConfig, 

94 RegistryDefaults, 

95) 

96from .transfers import RepoExportContext 

97 

98log = getLogger(__name__) 

99 

100 

101class ButlerValidationError(ValidationError): 

102 """There is a problem with the Butler configuration.""" 

103 

104 pass 

105 

106 

107class PruneCollectionsArgsError(TypeError): 

108 """Base class for errors relating to Butler.pruneCollections input 

109 arguments. 

110 """ 

111 

112 pass 

113 

114 

115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

116 """Raised when purge and unstore are both required to be True, and 

117 purge is True but unstore is False. 

118 """ 

119 

120 def __init__(self) -> None: 

121 super().__init__("Cannot pass purge=True without unstore=True.") 

122 

123 

124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

125 """Raised when pruning a RUN collection but purge is False.""" 

126 

127 def __init__(self, collectionType: CollectionType): 

128 self.collectionType = collectionType 

129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

130 

131 

132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

133 """Raised when purge is True but is not supported for the given 

134 collection.""" 

135 

136 def __init__(self, collectionType: CollectionType): 

137 self.collectionType = collectionType 

138 super().__init__( 

139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

140 ) 

141 

142 

143class Butler: 

144 """Main entry point for the data access system. 

145 

146 Parameters 

147 ---------- 

148 config : `ButlerConfig`, `Config` or `str`, optional. 

149 Configuration. Anything acceptable to the 

150 `ButlerConfig` constructor. If a directory path 

151 is given the configuration will be read from a ``butler.yaml`` file in 

152 that location. If `None` is given default values will be used. 

153 butler : `Butler`, optional. 

154 If provided, construct a new Butler that uses the same registry and 

155 datastore as the given one, but with the given collection and run. 

156 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

157 arguments. 

158 collections : `str` or `Iterable` [ `str` ], optional 

159 An expression specifying the collections to be searched (in order) when 

160 reading datasets. 

161 This may be a `str` collection name or an iterable thereof. 

162 See :ref:`daf_butler_collection_expressions` for more information. 

163 These collections are not registered automatically and must be 

164 manually registered before they are used by any method, but they may be 

165 manually registered after the `Butler` is initialized. 

166 run : `str`, optional 

167 Name of the `~CollectionType.RUN` collection new datasets should be 

168 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

169 ``collections`` will be set to ``[run]``. If not `None`, this 

170 collection will automatically be registered. If this is not set (and 

171 ``writeable`` is not set either), a read-only butler will be created. 

172 searchPaths : `list` of `str`, optional 

173 Directory paths to search when calculating the full Butler 

174 configuration. Not used if the supplied config is already a 

175 `ButlerConfig`. 

176 writeable : `bool`, optional 

177 Explicitly sets whether the butler supports write operations. If not 

178 provided, a read-write butler is created if any of ``run``, ``tags``, 

179 or ``chains`` is non-empty. 

180 inferDefaults : `bool`, optional 

181 If `True` (default) infer default data ID values from the values 

182 present in the datasets in ``collections``: if all collections have the 

183 same value (or no value) for a governor dimension, that value will be 

184 the default for that dimension. Nonexistent collections are ignored. 

185 If a default value is provided explicitly for a governor dimension via 

186 ``**kwargs``, no default will be inferred for that dimension. 

187 **kwargs : `str` 

188 Default data ID key-value pairs. These may only identify "governor" 

189 dimensions like ``instrument`` and ``skymap``. 

190 

191 Examples 

192 -------- 

193 While there are many ways to control exactly how a `Butler` interacts with 

194 the collections in its `Registry`, the most common cases are still simple. 

195 

196 For a read-only `Butler` that searches one collection, do:: 

197 

198 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

199 

200 For a read-write `Butler` that writes to and reads from a 

201 `~CollectionType.RUN` collection:: 

202 

203 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

204 

205 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

206 because we want to write to one `~CollectionType.RUN` collection but read 

207 from several others (as well):: 

208 

209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

210 collections=["u/alice/DM-50000/a", 

211 "u/bob/DM-49998", 

212 "HSC/defaults"]) 

213 

214 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

215 Datasets will be read first from that run (since it appears first in the 

216 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

217 

218 Finally, one can always create a `Butler` with no collections:: 

219 

220 butler = Butler("/path/to/repo", writeable=True) 

221 

222 This can be extremely useful when you just want to use ``butler.registry``, 

223 e.g. for inserting dimension data or managing collections, or when the 

224 collections you want to use with the butler are not consistent. 

225 Passing ``writeable`` explicitly here is only necessary if you want to be 

226 able to make changes to the repo - usually the value for ``writeable`` can 

227 be guessed from the collection arguments provided, but it defaults to 

228 `False` when there are no collection arguments.

229 """ 

230 

231 def __init__( 

232 self, 

233 config: Union[Config, str, None] = None, 

234 *, 

235 butler: Optional[Butler] = None, 

236 collections: Any = None, 

237 run: Optional[str] = None, 

238 searchPaths: Optional[List[str]] = None, 

239 writeable: Optional[bool] = None, 

240 inferDefaults: bool = True, 

241 **kwargs: str, 

242 ): 

243 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

244 # Load registry, datastore, etc. from config or existing butler. 

245 if butler is not None: 

246 if config is not None or searchPaths is not None or writeable is not None: 

247 raise TypeError( 

248 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

249 ) 

250 self.registry = butler.registry.copy(defaults) 

251 self.datastore = butler.datastore 

252 self.storageClasses = butler.storageClasses 

253 self._config: ButlerConfig = butler._config 

254 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

255 else: 

256 self._config = ButlerConfig(config, searchPaths=searchPaths) 

257 try: 

258 if "root" in self._config: 

259 butlerRoot = self._config["root"] 

260 else: 

261 butlerRoot = self._config.configDir 

262 if writeable is None: 

263 writeable = run is not None 

264 self.registry = Registry.fromConfig( 

265 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

266 ) 

267 self.datastore = Datastore.fromConfig( 

268 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

269 ) 

270 self.storageClasses = StorageClassFactory() 

271 self.storageClasses.addFromConfig(self._config) 

272 self._allow_put_of_predefined_dataset = self._config.get( 

273 "allow_put_of_predefined_dataset", False 

274 ) 

275 except Exception: 

276 # Failures here usually mean that configuration is incomplete, 

277 # just issue an error message which includes config file URI. 

278 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

279 raise 

280 

281 if "run" in self._config or "collection" in self._config: 

282 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

283 

284 GENERATION: ClassVar[int] = 3 

285 """This is a Generation 3 Butler. 

286 

287 This attribute may be removed in the future, once the Generation 2 Butler 

288 interface has been fully retired; it should only be used in transitional 

289 code. 

290 """ 

291 

292 @classmethod 

293 def get_repo_uri(cls, label: str) -> ResourcePath: 

294 """Look up the label in a butler repository index. 

295 

296 Parameters 

297 ---------- 

298 label : `str` 

299 Label of the Butler repository to look up. 

300 

301 Returns 

302 ------- 

303 uri : `lsst.resources.ResourcePath` 

304 URI to the Butler repository associated with the given label. 

305 

306 Raises 

307 ------ 

308 KeyError 

309 Raised if the label is not found in the index, or if an index 

310 can not be found at all. 

311 

312 Notes 

313 ----- 

314 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

315 information is discovered. 
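
 Examples
 --------
 A minimal sketch of resolving a label and constructing a butler from
 it (the label ``"main"`` is purely illustrative and depends on the
 site's repository index)::

     uri = Butler.get_repo_uri("main")
     butler = Butler(uri)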

316 """ 

317 return ButlerRepoIndex.get_repo_uri(label) 

318 

319 @classmethod 

320 def get_known_repos(cls) -> Set[str]: 

321 """Retrieve the list of known repository labels. 

322 

323 Returns 

324 ------- 

325 repos : `set` of `str` 

326 All the known labels. Can be empty if no index can be found. 

327 

328 Notes 

329 ----- 

330 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

331 information is discovered. 
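
 Examples
 --------
 A short sketch that lists every known label alongside its URI (the
 set may be empty if no index is configured)::

     for label in sorted(Butler.get_known_repos()):
         print(label, Butler.get_repo_uri(label))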

332 """ 

333 return ButlerRepoIndex.get_known_repos() 

334 

335 @staticmethod 

336 def makeRepo( 

337 root: ResourcePathExpression, 

338 config: Union[Config, str, None] = None, 

339 dimensionConfig: Union[Config, str, None] = None, 

340 standalone: bool = False, 

341 searchPaths: Optional[List[str]] = None, 

342 forceConfigRoot: bool = True, 

343 outfile: Optional[ResourcePathExpression] = None, 

344 overwrite: bool = False, 

345 ) -> Config: 

346 """Create an empty data repository by adding a butler.yaml config 

347 to a repository root directory. 

348 

349 Parameters 

350 ---------- 

351 root : `lsst.resources.ResourcePathExpression` 

352 Path or URI to the root location of the new repository. Will be 

353 created if it does not exist. 

354 config : `Config` or `str`, optional 

355 Configuration to write to the repository, after setting any 

356 root-dependent Registry or Datastore config options. Can not 

357 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

358 configuration will be used. Root-dependent config options 

359 specified in this config are overwritten if ``forceConfigRoot`` 

360 is `True`. 

361 dimensionConfig : `Config` or `str`, optional 

362 Configuration for dimensions, will be used to initialize registry 

363 database. 

364 standalone : `bool` 

365 If True, write all expanded defaults, not just customized or 

366 repository-specific settings. 

367 This (mostly) decouples the repository from the default 

368 configuration, insulating it from changes to the defaults (which 

369 may be good or bad, depending on the nature of the changes). 

370 Future *additions* to the defaults will still be picked up when 

371 initializing `Butlers` to repos created with ``standalone=True``. 

372 searchPaths : `list` of `str`, optional 

373 Directory paths to search when calculating the full butler 

374 configuration. 

375 forceConfigRoot : `bool`, optional 

376 If `False`, any values present in the supplied ``config`` that 

377 would normally be reset are not overridden and will appear 

378 directly in the output config. This allows non-standard overrides 

379 of the root directory for a datastore or registry to be given. 

380 If this parameter is `True` the values for ``root`` will be 

381 forced into the resulting config if appropriate. 

382 outfile : `lsst.resources.ResourcePathExpression`, optional

383 If not-`None`, the output configuration will be written to this 

384 location rather than into the repository itself. Can be a URI 

385 string. Can refer to a directory that will be used to write 

386 ``butler.yaml``. 

387 overwrite : `bool`, optional 

388 Create a new configuration file even if one already exists 

389 in the specified output location. Default is to raise 

390 an exception. 

391 

392 Returns 

393 ------- 

394 config : `Config` 

395 The updated `Config` instance written to the repo. 

396 

397 Raises 

398 ------ 

399 ValueError 

400 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

401 regular Config (as these subclasses would make it impossible to 

402 support ``standalone=False``). 

403 FileExistsError 

404 Raised if the output config file already exists. 

405 os.error 

406 Raised if the directory does not exist, exists but is not a 

407 directory, or cannot be created. 

408 

409 Notes 

410 ----- 

411 Note that when ``standalone=False`` (the default), the configuration 

412 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

413 construct the repository should also be used to construct any Butlers 

414 to avoid configuration inconsistencies. 
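
 Examples
 --------
 A minimal sketch that creates a repository in a hypothetical
 directory and then opens it for writing::

     Butler.makeRepo("/path/to/new/repo")
     butler = Butler("/path/to/new/repo", writeable=True)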

415 """ 

416 if isinstance(config, (ButlerConfig, ConfigSubset)): 

417 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

418 

419 # Ensure that the root of the repository exists or can be made 

420 root_uri = ResourcePath(root, forceDirectory=True) 

421 root_uri.mkdir() 

422 

423 config = Config(config) 

424 

425 # If we are creating a new repo from scratch with relative roots, 

426 # do not propagate an explicit root from the config file 

427 if "root" in config: 

428 del config["root"] 

429 

430 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

431 imported_class = doImportType(full["datastore", "cls"]) 

432 if not issubclass(imported_class, Datastore): 

433 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

434 datastoreClass: Type[Datastore] = imported_class 

435 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

436 

437 # if key exists in given config, parse it, otherwise parse the defaults 

438 # in the expanded config 

439 if config.get(("registry", "db")): 

440 registryConfig = RegistryConfig(config) 

441 else: 

442 registryConfig = RegistryConfig(full) 

443 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

444 if defaultDatabaseUri is not None: 

445 Config.updateParameters( 

446 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

447 ) 

448 else: 

449 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

450 

451 if standalone: 

452 config.merge(full) 

453 else: 

454 # Always expand the registry.managers section into the per-repo 

455 # config, because after the database schema is created, it's not 

456 # allowed to change anymore. Note that in the standalone=True 

457 # branch, _everything_ in the config is expanded, so there's no 

458 # need to special case this. 

459 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

460 configURI: ResourcePathExpression 

461 if outfile is not None: 

462 # When writing to a separate location we must include 

463 # the root of the butler repo in the config else it won't know 

464 # where to look. 

465 config["root"] = root_uri.geturl() 

466 configURI = outfile 

467 else: 

468 configURI = root_uri 

469 config.dumpToUri(configURI, overwrite=overwrite) 

470 

471 # Create Registry and populate tables 

472 registryConfig = RegistryConfig(config.get("registry")) 

473 dimensionConfig = DimensionConfig(dimensionConfig) 

474 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

475 

476 log.verbose("Wrote new Butler configuration file to %s", configURI) 

477 

478 return config 

479 

480 @classmethod 

481 def _unpickle( 

482 cls, 

483 config: ButlerConfig, 

484 collections: Optional[CollectionSearch], 

485 run: Optional[str], 

486 defaultDataId: Dict[str, str], 

487 writeable: bool, 

488 ) -> Butler: 

489 """Callable used to unpickle a Butler. 

490 

491 We prefer not to use ``Butler.__init__`` directly so we can force some 

492 of its many arguments to be keyword-only (note that ``__reduce__`` 

493 can only invoke callables with positional arguments). 

494 

495 Parameters 

496 ---------- 

497 config : `ButlerConfig` 

498 Butler configuration, already coerced into a true `ButlerConfig` 

499 instance (and hence after any search paths for overrides have been 

500 utilized). 

501 collections : `CollectionSearch` 

502 Names of the default collections to read from. 

503 run : `str`, optional 

504 Name of the default `~CollectionType.RUN` collection to write to. 

505 defaultDataId : `dict` [ `str`, `str` ] 

506 Default data ID values. 

507 writeable : `bool` 

508 Whether the Butler should support write operations. 

509 

510 Returns 

511 ------- 

512 butler : `Butler` 

513 A new `Butler` instance. 

514 """ 

515 # MyPy doesn't recognize that the kwargs below are totally valid; it 

516 # seems to think ``**defaultDataId`` is a _positional_ argument!

517 return cls( 

518 config=config, 

519 collections=collections, 

520 run=run, 

521 writeable=writeable, 

522 **defaultDataId, # type: ignore 

523 ) 

524 

525 def __reduce__(self) -> tuple: 

526 """Support pickling.""" 

527 return ( 

528 Butler._unpickle, 

529 ( 

530 self._config, 

531 self.collections, 

532 self.run, 

533 self.registry.defaults.dataId.byName(), 

534 self.registry.isWriteable(), 

535 ), 

536 ) 

537 

538 def __str__(self) -> str: 

539 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

540 self.collections, self.run, self.datastore, self.registry 

541 ) 

542 

543 def isWriteable(self) -> bool: 

544 """Return `True` if this `Butler` supports write operations.""" 

545 return self.registry.isWriteable() 

546 

547 @contextlib.contextmanager 

548 def transaction(self) -> Iterator[None]: 

549 """Context manager supporting `Butler` transactions. 

550 

551 Transactions can be nested. 
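
 Examples
 --------
 A sketch of grouping two writes so that neither is committed if
 either fails; ``flat``, ``bias``, and ``dataId`` are placeholders::

     with butler.transaction():
         butler.put(flat, "flat", dataId)
         butler.put(bias, "bias", dataId)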

552 """ 

553 with self.registry.transaction(): 

554 with self.datastore.transaction(): 

555 yield 

556 

557 def _standardizeArgs( 

558 self, 

559 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

560 dataId: Optional[DataId] = None, 

561 for_put: bool = True, 

562 **kwargs: Any, 

563 ) -> Tuple[DatasetType, Optional[DataId]]: 

564 """Standardize the arguments passed to several Butler APIs. 

565 

566 Parameters 

567 ---------- 

568 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

569 When `DatasetRef` the `dataId` should be `None`. 

570 Otherwise the `DatasetType` or name thereof. 

571 dataId : `dict` or `DataCoordinate` 

572 A `dict` of `Dimension` link name, value pairs that label the 

573 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

574 should be provided as the first argument.

575 for_put : `bool`, optional 

576 If `True` this call is invoked as part of a `Butler.put()`. 

577 Otherwise it is assumed to be part of a `Butler.get()`. This 

578 parameter is only relevant if there is dataset type 

579 inconsistency. 

580 **kwargs 

581 Additional keyword arguments used to augment or construct a 

582 `DataCoordinate`. See `DataCoordinate.standardize` 

583 parameters. 

584 

585 Returns 

586 ------- 

587 datasetType : `DatasetType` 

588 A `DatasetType` instance extracted from ``datasetRefOrType``. 

589 dataId : `dict` or `DataId`, optional 

590 Argument that can be used (along with ``kwargs``) to construct a 

591 `DataId`. 

592 

593 Notes 

594 ----- 

595 Butler APIs that conceptually need a DatasetRef also allow passing a 

596 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

597 keyword arguments that can be used to construct one) separately. This 

598 method accepts those arguments and always returns a true `DatasetType` 

599 and a `DataId` or `dict`. 

600 

601 Standardization of `dict` vs `DataId` is best handled by passing the 

602 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

603 generally similarly flexible. 

604 """ 

605 externalDatasetType: Optional[DatasetType] = None 

606 internalDatasetType: Optional[DatasetType] = None 

607 if isinstance(datasetRefOrType, DatasetRef): 

608 if dataId is not None or kwargs: 

609 raise ValueError("DatasetRef given, cannot use dataId as well") 

610 externalDatasetType = datasetRefOrType.datasetType 

611 dataId = datasetRefOrType.dataId 

612 else: 

613 # Don't check whether DataId is provided, because Registry APIs 

614 # can usually construct a better error message when it wasn't. 

615 if isinstance(datasetRefOrType, DatasetType): 

616 externalDatasetType = datasetRefOrType 

617 else: 

618 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

619 

620 # Check that they are self-consistent 

621 if externalDatasetType is not None: 

622 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

623 if externalDatasetType != internalDatasetType: 

624 # We can allow differences if they are compatible, depending 

625 # on whether this is a get or a put. A get requires that 

626 # the python type associated with the datastore can be 

627 # converted to the user type. A put requires that the user 

628 # supplied python type can be converted to the internal 

629 # type expected by registry. 

630 relevantDatasetType = internalDatasetType 

631 if for_put: 

632 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

633 else: 

634 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

635 relevantDatasetType = externalDatasetType 

636 if not is_compatible: 

637 raise ValueError( 

638 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

639 f"registry definition ({internalDatasetType})" 

640 ) 

641 # Override the internal definition. 

642 internalDatasetType = relevantDatasetType 

643 

644 assert internalDatasetType is not None 

645 return internalDatasetType, dataId 

646 

647 def _rewrite_data_id( 

648 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

649 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

650 """Rewrite a data ID taking into account dimension records. 

651 

652 Take a Data ID and keyword args and rewrite it if necessary to 

653 allow the user to specify dimension records rather than dimension 

654 primary values. 

655 

656 This allows a user to include a dataId dict with keys of 

657 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

658 the integer exposure ID. It also allows a string to be given 

659 for a dimension value rather than the integer ID if that is more 

660 convenient. For example, rather than having to specify the

661 detector with ``detector.full_name``, a string given for ``detector`` 

662 will be interpreted as the full name and converted to the integer 

663 value. 

664 

665 Keyword arguments can also use strings for dimensions like detector 

666 and exposure but python does not allow them to include ``.`` and 

667 so the ``exposure.day_obs`` syntax can not be used in a keyword 

668 argument. 

669 

670 Parameters 

671 ---------- 

672 dataId : `dict` or `DataCoordinate` 

673 A `dict` of `Dimension` link name, value pairs that will label the 

674 `DatasetRef` within a Collection. 

675 datasetType : `DatasetType` 

676 The dataset type associated with this dataId. Required to 

677 determine the relevant dimensions. 

678 **kwargs 

679 Additional keyword arguments used to augment or construct a 

680 `DataId`. See `DataId` parameters. 

681 

682 Returns 

683 ------- 

684 dataId : `dict` or `DataCoordinate` 

685 The possibly rewritten dataId. If given a `DataCoordinate` and

686 no keyword arguments, the original dataId will be returned 

687 unchanged. 

688 **kwargs : `dict` 

689 Any unused keyword arguments (would normally be empty dict). 
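
 Examples
 --------
 A sketch of the rewriting described above, with purely illustrative
 values; the record-based keys are resolved to the corresponding
 primary-key values by querying dimension records::

     dataId = {
         "instrument": "HSC",
         "exposure.day_obs": 20210405,
         "exposure.seq_num": 42,
         "detector": "1_53",  # full name rather than the integer ID
     }
     # After rewriting, the data ID uses primary keys only, e.g.
     # {"instrument": "HSC", "exposure": <exposure id>,
     #  "detector": <detector id>}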

690 """ 

691 # Do nothing if we have a standalone DataCoordinate. 

692 if isinstance(dataId, DataCoordinate) and not kwargs: 

693 return dataId, kwargs 

694 

695 # Process dimension records that are using record information 

696 # rather than ids 

697 newDataId: Dict[str, DataIdValue] = {} 

698 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

699 

700 # If the entire dataId comes from keyword parameters we do not need

701 # to do anything here because they can't be of the form 

702 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

703 if dataId: 

704 for k, v in dataId.items(): 

705 # If we have a Dimension we do not need to do anything 

706 # because it cannot be a compound key. 

707 if isinstance(k, str) and "." in k: 

708 # Someone is using a more human-readable dataId 

709 dimensionName, record = k.split(".", 1) 

710 byRecord[dimensionName][record] = v 

711 elif isinstance(k, Dimension): 

712 newDataId[k.name] = v 

713 else: 

714 newDataId[k] = v 

715 

716 # Go through the updated dataId and check the type in case someone is 

717 # using an alternate key. We have already filtered out the compound 

718 # keys of the dimension.record form.

719 not_dimensions = {} 

720 

721 # Will need to look in the dataId and the keyword arguments 

722 # and will remove them if they need to be fixed or are unrecognized. 

723 for dataIdDict in (newDataId, kwargs): 

724 # Use a list so we can adjust the dict safely in the loop 

725 for dimensionName in list(dataIdDict): 

726 value = dataIdDict[dimensionName] 

727 try: 

728 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

729 except KeyError: 

730 # This is not a real dimension 

731 not_dimensions[dimensionName] = value 

732 del dataIdDict[dimensionName] 

733 continue 

734 

735 # Convert an integral type to an explicit int to simplify 

736 # comparisons here 

737 if isinstance(value, numbers.Integral): 

738 value = int(value) 

739 

740 if not isinstance(value, dimension.primaryKey.getPythonType()): 

741 for alternate in dimension.alternateKeys: 

742 if isinstance(value, alternate.getPythonType()): 

743 byRecord[dimensionName][alternate.name] = value 

744 del dataIdDict[dimensionName] 

745 log.debug( 

746 "Converting dimension %s to %s.%s=%s", 

747 dimensionName, 

748 dimensionName, 

749 alternate.name, 

750 value, 

751 ) 

752 break 

753 else: 

754 log.warning( 

755 "Type mismatch found for value '%r' provided for dimension %s. " 

756 "Could not find matching alternative (primary key has type %s) " 

757 "so attempting to use as-is.", 

758 value, 

759 dimensionName, 

760 dimension.primaryKey.getPythonType(), 

761 ) 

762 

763 # By this point kwargs and newDataId should only include valid 

764 # dimensions. Merge kwargs into the new dataId and log if there

765 # are dimensions in both (rather than calling update). 

766 for k, v in kwargs.items(): 

767 if k in newDataId and newDataId[k] != v: 

768 log.debug( 

769 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

770 ) 

771 newDataId[k] = v 

772 # No need to retain any values in kwargs now. 

773 kwargs = {} 

774 

775 # If we have some unrecognized dimensions we have to try to connect 

776 # them to records in other dimensions. This is made more complicated 

777 # by some dimensions having records with clashing names. A mitigation 

778 # is that we can tell by this point which dimensions are missing 

779 # for the DatasetType but this does not work for calibrations 

780 # where additional dimensions can be used to constrain the temporal 

781 # axis. 

782 if not_dimensions: 

783 # Search for all dimensions even if we have been given a value 

784 # explicitly. In some cases records are given as well as the 

785 # actually dimension and this should not be an error if they 

786 # match. 

787 mandatoryDimensions = datasetType.dimensions.names # - provided 

788 

789 candidateDimensions: Set[str] = set() 

790 candidateDimensions.update(mandatoryDimensions) 

791 

792 # For calibrations we may well be needing temporal dimensions 

793 # so rather than always including all dimensions in the scan 

794 # restrict things a little. It is still possible for there 

795 # to be confusion over day_obs in visit vs exposure for example. 

796 # If we are not searching calibration collections things may 

797 # fail but they are going to fail anyway because of the 

798 # ambiguity of the dataId...

799 if datasetType.isCalibration(): 

800 for dim in self.registry.dimensions.getStaticDimensions(): 

801 if dim.temporal: 

802 candidateDimensions.add(str(dim)) 

803 

804 # Look up table for the first association with a dimension 

805 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

806 

807 # Keep track of whether an item is associated with multiple 

808 # dimensions. 

809 counter: Counter[str] = Counter() 

810 assigned: Dict[str, Set[str]] = defaultdict(set) 

811 

812 # Go through the missing dimensions and associate the 

813 # given names with records within those dimensions 

814 matched_dims = set() 

815 for dimensionName in candidateDimensions: 

816 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

817 fields = dimension.metadata.names | dimension.uniqueKeys.names 

818 for field in not_dimensions: 

819 if field in fields: 

820 guessedAssociation[dimensionName][field] = not_dimensions[field] 

821 counter[dimensionName] += 1 

822 assigned[field].add(dimensionName) 

823 matched_dims.add(field) 

824 

825 # Calculate the fields that matched nothing. 

826 never_found = set(not_dimensions) - matched_dims 

827 

828 if never_found: 

829 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

830 

831 # There is a chance we have allocated a single dataId item 

832 # to multiple dimensions. Need to decide which should be retained. 

833 # For now assume that the most popular alternative wins. 

834 # This means that day_obs with seq_num will result in 

835 # exposure.day_obs and not visit.day_obs 

836 # Also prefer an explicitly missing dimension over an inferred 

837 # temporal dimension. 

838 for fieldName, assignedDimensions in assigned.items(): 

839 if len(assignedDimensions) > 1: 

840 # Pick the most popular (preferring mandatory dimensions) 

841 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

842 if requiredButMissing: 

843 candidateDimensions = requiredButMissing 

844 else: 

845 candidateDimensions = assignedDimensions 

846 

847 # Select the relevant items and get a new restricted 

848 # counter. 

849 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

850 duplicatesCounter: Counter[str] = Counter() 

851 duplicatesCounter.update(theseCounts) 

852 

853 # Choose the most common. If they are equally common 

854 # we will pick the one that was found first. 

855 # most_common returns a list of (key, count) tuples, hence [0][0].

856 selected = duplicatesCounter.most_common(1)[0][0] 

857 

858 log.debug( 

859 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

860 " Removed ambiguity by choosing dimension %s.", 

861 fieldName, 

862 ", ".join(assignedDimensions), 

863 selected, 

864 ) 

865 

866 for candidateDimension in assignedDimensions: 

867 if candidateDimension != selected: 

868 del guessedAssociation[candidateDimension][fieldName] 

869 

870 # Update the record look up dict with the new associations 

871 for dimensionName, values in guessedAssociation.items(): 

872 if values: # A dict might now be empty 

873 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

874 byRecord[dimensionName].update(values) 

875 

876 if byRecord: 

877 # Some record specifiers were found so we need to convert 

878 # them to the Id form 

879 for dimensionName, values in byRecord.items(): 

880 if dimensionName in newDataId: 

881 log.debug( 

882 "DataId specified explicit %s dimension value of %s in addition to" 

883 " general record specifiers for it of %s. Ignoring record information.", 

884 dimensionName, 

885 newDataId[dimensionName], 

886 str(values), 

887 ) 

888 # Get the actual record and compare with these values. 

889 try: 

890 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

891 except LookupError: 

892 raise ValueError( 

893 f"Could not find dimension '{dimensionName}'" 

894 f" with dataId {newDataId} as part of comparing with" 

895 f" record values {byRecord[dimensionName]}" 

896 ) from None 

897 if len(recs) == 1: 

898 errmsg: List[str] = [] 

899 for k, v in values.items(): 

900 if (recval := getattr(recs[0], k)) != v: 

901 errmsg.append(f"{k}({recval} != {v})") 

902 if errmsg: 

903 raise ValueError( 

904 f"Dimension {dimensionName} in dataId has explicit value" 

905 " inconsistent with records: " + ", ".join(errmsg) 

906 ) 

907 else: 

908 # Multiple matches for an explicit dimension 

909 # should never happen but let downstream complain. 

910 pass 

911 continue 

912 

913 # Build up a WHERE expression 

914 bind = {k: v for k, v in values.items()} 

915 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

916 

917 # Hopefully we get a single record that matches 

918 records = set( 

919 self.registry.queryDimensionRecords( 

920 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

921 ) 

922 ) 

923 

924 if len(records) != 1: 

925 if len(records) > 1: 

926 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

927 for r in records: 

928 log.debug("- %s", str(r)) 

929 raise ValueError( 

930 f"DataId specification for dimension {dimensionName} is not" 

931 f" uniquely constrained to a single dataset by {values}." 

932 f" Got {len(records)} results." 

933 ) 

934 raise ValueError( 

935 f"DataId specification for dimension {dimensionName} matched no" 

936 f" records when constrained by {values}" 

937 ) 

938 

939 # Get the primary key from the real dimension object 

940 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

941 if not isinstance(dimension, Dimension): 

942 raise RuntimeError( 

943 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

944 ) 

945 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

946 

947 return newDataId, kwargs 

948 

949 def _findDatasetRef( 

950 self, 

951 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

952 dataId: Optional[DataId] = None, 

953 *, 

954 collections: Any = None, 

955 allowUnresolved: bool = False, 

956 **kwargs: Any, 

957 ) -> DatasetRef: 

958 """Shared logic for methods that start with a search for a dataset in 

959 the registry. 

960 

961 Parameters 

962 ---------- 

963 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

964 When `DatasetRef` the `dataId` should be `None`. 

965 Otherwise the `DatasetType` or name thereof. 

966 dataId : `dict` or `DataCoordinate`, optional 

967 A `dict` of `Dimension` link name, value pairs that label the 

968 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

969 should be provided as the first argument. 

970 collections : Any, optional 

971 Collections to be searched, overriding ``self.collections``. 

972 Can be any of the types supported by the ``collections`` argument 

973 to butler construction. 

974 allowUnresolved : `bool`, optional 

975 If `True`, return an unresolved `DatasetRef` if finding a resolved 

976 one in the `Registry` fails. Defaults to `False`. 

977 **kwargs 

978 Additional keyword arguments used to augment or construct a 

979 `DataId`. See `DataId` parameters. 

980 

981 Returns 

982 ------- 

983 ref : `DatasetRef` 

984 A reference to the dataset identified by the given arguments. 

985 

986 Raises 

987 ------ 

988 LookupError 

989 Raised if no matching dataset exists in the `Registry` (and 

990 ``allowUnresolved is False``). 

991 ValueError 

992 Raised if a resolved `DatasetRef` was passed as an input, but it 

993 differs from the one found in the registry. 

994 TypeError 

995 Raised if no collections were provided. 

996 """ 

997 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

998 if isinstance(datasetRefOrType, DatasetRef): 

999 idNumber = datasetRefOrType.id 

1000 else: 

1001 idNumber = None 

1002 timespan: Optional[Timespan] = None 

1003 

1004 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1005 

1006 if datasetType.isCalibration(): 

1007 # Because this is a calibration dataset, first try to

1008 # standardize the data ID without restricting the dimensions to 

1009 # those of the dataset type requested, because there may be extra 

1010 # dimensions that provide temporal information for a validity-range 

1011 # lookup. 

1012 dataId = DataCoordinate.standardize( 

1013 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1014 ) 

1015 if dataId.graph.temporal: 

1016 dataId = self.registry.expandDataId(dataId) 

1017 timespan = dataId.timespan 

1018 else: 

1019 # Standardize the data ID to just the dimensions of the dataset 

1020 # type instead of letting registry.findDataset do it, so we get the 

1021 # result even if no dataset is found. 

1022 dataId = DataCoordinate.standardize( 

1023 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1024 ) 

1025 # Always look up the DatasetRef, even if one is given, to ensure it is

1026 # present in the current collection. 

1027 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1028 if ref is None: 

1029 if allowUnresolved: 

1030 return DatasetRef(datasetType, dataId) 

1031 else: 

1032 if collections is None: 

1033 collections = self.registry.defaults.collections 

1034 raise LookupError( 

1035 f"Dataset {datasetType.name} with data ID {dataId} " 

1036 f"could not be found in collections {collections}." 

1037 ) 

1038 if idNumber is not None and idNumber != ref.id: 

1039 if collections is None: 

1040 collections = self.registry.defaults.collections 

1041 raise ValueError( 

1042 f"DatasetRef.id provided ({idNumber}) does not match " 

1043 f"id ({ref.id}) in registry in collections {collections}." 

1044 ) 

1045 if datasetType != ref.datasetType: 

1046 # If they differ it is because the user explicitly specified 

1047 # a compatible dataset type to this call rather than using the 

1048 # registry definition. The DatasetRef must therefore be recreated 

1049 # using the user definition such that the expected type is 

1050 # returned. 

1051 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1052 

1053 return ref 

1054 

1055 @transactional 

1056 def put( 

1057 self, 

1058 obj: Any, 

1059 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1060 dataId: Optional[DataId] = None, 

1061 *, 

1062 run: Optional[str] = None, 

1063 **kwargs: Any, 

1064 ) -> DatasetRef: 

1065 """Store and register a dataset. 

1066 

1067 Parameters 

1068 ---------- 

1069 obj : `object` 

1070 The dataset. 

1071 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1072 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1073 Otherwise the `DatasetType` or name thereof. 

1074 dataId : `dict` or `DataCoordinate` 

1075 A `dict` of `Dimension` link name, value pairs that label the 

1076 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1077 should be provided as the second argument. 

1078 run : `str`, optional 

1079 The name of the run the dataset should be added to, overriding 

1080 ``self.run``. 

1081 **kwargs 

1082 Additional keyword arguments used to augment or construct a 

1083 `DataCoordinate`. See `DataCoordinate.standardize` 

1084 parameters. 

1085 

1086 Returns 

1087 ------- 

1088 ref : `DatasetRef` 

1089 A reference to the stored dataset, updated with the correct id if 

1090 given. 

1091 

1092 Raises 

1093 ------ 

1094 TypeError 

1095 Raised if the butler is read-only or if no run has been provided. 
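
 Examples
 --------
 A minimal sketch, assuming a writeable butler with a default run and
 a registered ``"flat"`` dataset type (names and data ID values are
 illustrative)::

     ref = butler.put(flat, "flat", instrument="HSC", detector=50,
                      physical_filter="HSC-R")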

1096 """ 

1097 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1098 if not self.isWriteable(): 

1099 raise TypeError("Butler is read-only.") 

1100 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1101 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1102 raise ValueError("DatasetRef must not be in registry, must have None id") 

1103 

1104 # Handle dimension records in dataId 

1105 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1106 

1107 # Add Registry Dataset entry. 

1108 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1109 

1110 # For an execution butler the datasets will be pre-defined. 

1111 # If the butler is configured that way datasets should only be inserted 

1112 # if they do not already exist in registry. Trying and catching 

1113 # ConflictingDefinitionError will not work because the transaction 

1114 # will be corrupted. Instead, in this mode always check first. 

1115 ref = None 

1116 ref_is_predefined = False 

1117 if self._allow_put_of_predefined_dataset: 

1118 # Get the matching ref for this run. 

1119 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1120 

1121 if ref: 

1122 # Must be expanded form for datastore templating 

1123 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1124 ref = ref.expanded(dataId) 

1125 ref_is_predefined = True 

1126 

1127 if not ref: 

1128 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1129 

1130 # If the ref is predefined it is possible that the datastore also 

1131 # has the record. Asking datastore to put it again will result in 

1132 # the artifact being recreated, overwriting previous, then will cause 

1133 # a failure in writing the record which will cause the artifact 

1134 # to be removed. Much safer to ask first before attempting to 

1135 # overwrite. Race conditions should not be an issue for the 

1136 # execution butler environment. 

1137 if ref_is_predefined: 

1138 if self.datastore.knows(ref): 

1139 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

1140 

1141 self.datastore.put(obj, ref) 

1142 

1143 return ref 

1144 

1145 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1146 """Retrieve a stored dataset. 

1147 

1148 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1149 collection to be read as long as the `DatasetRef` that identifies them 

1150 can be obtained separately. 

1151 

1152 Parameters 

1153 ---------- 

1154 ref : `DatasetRef` 

1155 Resolved reference to an already stored dataset. 

1156 parameters : `dict` 

1157 Additional StorageClass-defined options to control reading, 

1158 typically used to efficiently read only a subset of the dataset. 

1159 

1160 Returns 

1161 ------- 

1162 obj : `object` 

1163 The dataset. 
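
 Examples
 --------
 A sketch of reading a dataset found through a registry query rather
 than through the butler's default collections (the dataset type and
 collection names are illustrative)::

     refs = butler.registry.queryDatasets("flat", collections="HSC/calib")
     obj = butler.getDirect(next(iter(refs)))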

1164 """ 

1165 return self.datastore.get(ref, parameters=parameters) 

1166 

1167 def getDirectDeferred( 

1168 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

1169 ) -> DeferredDatasetHandle: 

1170 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1171 from a resolved `DatasetRef`. 

1172 

1173 Parameters 

1174 ---------- 

1175 ref : `DatasetRef` 

1176 Resolved reference to an already stored dataset. 

1177 parameters : `dict` 

1178 Additional StorageClass-defined options to control reading, 

1179 typically used to efficiently read only a subset of the dataset. 

1180 

1181 Returns 

1182 ------- 

1183 obj : `DeferredDatasetHandle` 

1184 A handle which can be used to retrieve a dataset at a later time. 

1185 

1186 Raises 

1187 ------ 

1188 AmbiguousDatasetError 

1189 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1190 """ 

1191 if ref.id is None: 

1192 raise AmbiguousDatasetError( 

1193 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1194 ) 

1195 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1196 

1197 def getDeferred( 

1198 self, 

1199 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1200 dataId: Optional[DataId] = None, 

1201 *, 

1202 parameters: Union[dict, None] = None, 

1203 collections: Any = None, 

1204 **kwargs: Any, 

1205 ) -> DeferredDatasetHandle: 

1206 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1207 after an immediate registry lookup. 

1208 

1209 Parameters 

1210 ---------- 

1211 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1212 When `DatasetRef` the `dataId` should be `None`. 

1213 Otherwise the `DatasetType` or name thereof. 

1214 dataId : `dict` or `DataCoordinate`, optional 

1215 A `dict` of `Dimension` link name, value pairs that label the 

1216 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1217 should be provided as the first argument. 

1218 parameters : `dict` 

1219 Additional StorageClass-defined options to control reading, 

1220 typically used to efficiently read only a subset of the dataset. 

1221 collections : Any, optional 

1222 Collections to be searched, overriding ``self.collections``. 

1223 Can be any of the types supported by the ``collections`` argument 

1224 to butler construction. 

1225 **kwargs 

1226 Additional keyword arguments used to augment or construct a 

1227 `DataId`. See `DataId` parameters. 

1228 

1229 Returns 

1230 ------- 

1231 obj : `DeferredDatasetHandle` 

1232 A handle which can be used to retrieve a dataset at a later time. 

1233 

1234 Raises 

1235 ------ 

1236 LookupError 

1237 Raised if no matching dataset exists in the `Registry` (and 

1238 ``allowUnresolved is False``). 

1239 ValueError 

1240 Raised if a resolved `DatasetRef` was passed as an input, but it 

1241 differs from the one found in the registry. 

1242 TypeError 

1243 Raised if no collections were provided. 
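
 Examples
 --------
 A sketch of deferring the read until the object is needed
 (``"calexp"`` and the data ID values are placeholders)::

     handle = butler.getDeferred("calexp", instrument="HSC",
                                 visit=903334, detector=20)
     exposure = handle.get()  # the actual read happens here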

1244 """ 

1245 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1246 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1247 

1248 def get( 

1249 self, 

1250 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1251 dataId: Optional[DataId] = None, 

1252 *, 

1253 parameters: Optional[Dict[str, Any]] = None, 

1254 collections: Any = None, 

1255 **kwargs: Any, 

1256 ) -> Any: 

1257 """Retrieve a stored dataset. 

1258 

1259 Parameters 

1260 ---------- 

1261 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1262 When `DatasetRef` the `dataId` should be `None`. 

1263 Otherwise the `DatasetType` or name thereof. 

1264 dataId : `dict` or `DataCoordinate` 

1265 A `dict` of `Dimension` link name, value pairs that label the 

1266 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1267 should be provided as the first argument. 

1268 parameters : `dict` 

1269 Additional StorageClass-defined options to control reading, 

1270 typically used to efficiently read only a subset of the dataset. 

1271 collections : Any, optional 

1272 Collections to be searched, overriding ``self.collections``. 

1273 Can be any of the types supported by the ``collections`` argument 

1274 to butler construction. 

1275 **kwargs 

1276 Additional keyword arguments used to augment or construct a 

1277 `DataCoordinate`. See `DataCoordinate.standardize` 

1278 parameters. 

1279 

1280 Returns 

1281 ------- 

1282 obj : `object` 

1283 The dataset. 

1284 

1285 Raises 

1286 ------ 

1287 ValueError 

1288 Raised if a resolved `DatasetRef` was passed as an input, but it 

1289 differs from the one found in the registry. 

1290 LookupError 

1291 Raised if no matching dataset exists in the `Registry`. 

1292 TypeError 

1293 Raised if no collections were provided. 

1294 

1295 Notes 

1296 ----- 

1297 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1298 this method requires that the given data ID include temporal dimensions 

1299 beyond the dimensions of the dataset type itself, in order to find the 

1300 dataset with the appropriate validity range. For example, a "bias" 

1301 dataset with native dimensions ``{instrument, detector}`` could be 

1302 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1303 ``exposure`` is a temporal dimension. 
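
 Examples
 --------
 A sketch of the calibration lookup described above: the data ID
 carries an ``exposure`` value purely to select the validity range
 (names and values are illustrative)::

     bias = butler.get("bias", instrument="HSC", detector=50,
                       exposure=903334, collections="HSC/calib")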

1304 """ 

1305 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1306 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1307 return self.getDirect(ref, parameters=parameters) 

1308 

1309 def getURIs( 

1310 self, 

1311 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1312 dataId: Optional[DataId] = None, 

1313 *, 

1314 predict: bool = False, 

1315 collections: Any = None, 

1316 run: Optional[str] = None, 

1317 **kwargs: Any, 

1318 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1319 """Returns the URIs associated with the dataset. 

1320 

1321 Parameters 

1322 ---------- 

1323 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1324 When `DatasetRef` the `dataId` should be `None`. 

1325 Otherwise the `DatasetType` or name thereof. 

1326 dataId : `dict` or `DataCoordinate` 

1327 A `dict` of `Dimension` link name, value pairs that label the 

1328 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1329 should be provided as the first argument. 

1330 predict : `bool` 

1331 If `True`, allow URIs to be returned of datasets that have not 

1332 been written. 

1333 collections : Any, optional 

1334 Collections to be searched, overriding ``self.collections``. 

1335 Can be any of the types supported by the ``collections`` argument 

1336 to butler construction. 

1337 run : `str`, optional 

1338 Run to use for predictions, overriding ``self.run``. 

1339 **kwargs 

1340 Additional keyword arguments used to augment or construct a 

1341 `DataCoordinate`. See `DataCoordinate.standardize` 

1342 parameters. 

1343 

1344 Returns 

1345 ------- 

1346 primary : `lsst.resources.ResourcePath` 

1347 The URI to the primary artifact associated with this dataset. 

1348 If the dataset was disassembled within the datastore this 

1349 may be `None`. 

1350 components : `dict` 

1351 URIs to any components associated with the dataset artifact. 

1352 Can be empty if there are no components. 

1353 """ 

1354 ref = self._findDatasetRef( 

1355 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1356 ) 

1357 if ref.id is None: # only possible if predict is True 

1358 if run is None: 

1359 run = self.run 

1360 if run is None: 

1361 raise TypeError("Cannot predict location with run=None.") 

1362 # Lie about ID, because we can't guess it, and only 

1363 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1364 ref = ref.resolved(id=0, run=run) 

1365 return self.datastore.getURIs(ref, predict) 

1366 

1367 def getURI( 

1368 self, 

1369 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1370 dataId: Optional[DataId] = None, 

1371 *, 

1372 predict: bool = False, 

1373 collections: Any = None, 

1374 run: Optional[str] = None, 

1375 **kwargs: Any, 

1376 ) -> ResourcePath: 

1377 """Return the URI to the Dataset. 

1378 

1379 Parameters 

1380 ---------- 

1381 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1382 When `DatasetRef` the `dataId` should be `None`. 

1383 Otherwise the `DatasetType` or name thereof. 

1384 dataId : `dict` or `DataCoordinate` 

1385 A `dict` of `Dimension` link name, value pairs that label the 

1386 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1387 should be provided as the first argument. 

1388 predict : `bool` 

1389 If `True`, allow URIs to be returned of datasets that have not 

1390 been written. 

1391 collections : Any, optional 

1392 Collections to be searched, overriding ``self.collections``. 

1393 Can be any of the types supported by the ``collections`` argument 

1394 to butler construction. 

1395 run : `str`, optional 

1396 Run to use for predictions, overriding ``self.run``. 

1397 **kwargs 

1398 Additional keyword arguments used to augment or construct a 

1399 `DataCoordinate`. See `DataCoordinate.standardize` 

1400 parameters. 

1401 

1402 Returns 

1403 ------- 

1404 uri : `lsst.resources.ResourcePath` 

1405 URI pointing to the Dataset within the datastore. If the 

1406 Dataset does not exist in the datastore, and if ``predict`` is 

1407 `True`, the URI will be a prediction and will include a URI 

1408 fragment "#predicted". 

1409 If the datastore does not have entities that relate well 

1410 to the concept of a URI the returned URI string will be 

1411 descriptive. The returned URI is not guaranteed to be obtainable. 

1412 

1413 Raises 

1414 ------ 

1415 LookupError 

1416 Raised if a URI has been requested for a dataset that does not

1417 exist and guessing is not allowed.

1418 ValueError 

1419 Raised if a resolved `DatasetRef` was passed as an input, but it 

1420 differs from the one found in the registry. 

1421 TypeError 

1422 Raised if no collections were provided. 

1423 RuntimeError 

1424 Raised if a URI is requested for a dataset that consists of 

1425 multiple artifacts. 

1426 """ 

1427 primary, components = self.getURIs( 

1428 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1429 ) 

1430 

1431 if primary is None or components: 

1432 raise RuntimeError( 

1433 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1434 "Use Butler.getURIs() instead." 

1435 ) 

1436 return primary 
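A minimal usage sketch for the two URI accessors above. The repository path, collection name, dataset type, and data ID values are hypothetical placeholders; ``getURIs`` is required whenever the datastore has disassembled the dataset into components::

    from lsst.daf.butler import Butler

    butler = Butler("/repo", collections=["HSC/runs/example"])

    # Single-artifact dataset: one ResourcePath comes back.
    uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=16)

    # Disassembled dataset: a primary URI (possibly None) plus a dict of
    # per-component URIs.
    primary, components = butler.getURIs(
        "calexp", instrument="HSC", visit=903334, detector=16
    )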

1437 

1438 def retrieveArtifacts( 

1439 self, 

1440 refs: Iterable[DatasetRef], 

1441 destination: ResourcePathExpression, 

1442 transfer: str = "auto", 

1443 preserve_path: bool = True, 

1444 overwrite: bool = False, 

1445 ) -> List[ResourcePath]: 

1446 """Retrieve the artifacts associated with the supplied refs. 

1447 

1448 Parameters 

1449 ---------- 

1450 refs : iterable of `DatasetRef` 

1451 The datasets for which artifacts are to be retrieved. 

1452 A single ref can result in multiple artifacts. The refs must 

1453 be resolved. 

1454 destination : `lsst.resources.ResourcePath` or `str` 

1455 Location to write the artifacts. 

1456 transfer : `str`, optional 

1457 Method to use to transfer the artifacts. Must be one of the options 

1458 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1459 "move" is not allowed. 

1460 preserve_path : `bool`, optional 

1461 If `True` the full path of the artifact within the datastore 

1462 is preserved. If `False` the final file component of the path 

1463 is used. 

1464 overwrite : `bool`, optional 

1465 If `True` allow transfers to overwrite existing files at the 

1466 destination. 

1467 

1468 Returns 

1469 ------- 

1470 targets : `list` of `lsst.resources.ResourcePath` 

1471 URIs of file artifacts in destination location. Order is not 

1472 preserved. 

1473 

1474 Notes 

1475 ----- 

1476 For non-file datastores the artifacts written to the destination 

1477 may not match the representation inside the datastore. For example 

1478 a hierarchical data structure in a NoSQL database may well be stored 

1479 as a JSON file. 

1480 """ 

1481 return self.datastore.retrieveArtifacts( 

1482 refs, 

1483 ResourcePath(destination), 

1484 transfer=transfer, 

1485 preserve_path=preserve_path, 

1486 overwrite=overwrite, 

1487 ) 
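As a sketch (collection, dataset type, and destination are hypothetical), the method above pairs naturally with a registry query to copy file artifacts out of the datastore; it assumes the ``butler`` instance from the earlier sketch::

    refs = butler.registry.queryDatasets("calexp", collections="HSC/runs/example")

    # Copy the underlying files into ./exported_files, preserving the
    # datastore-relative paths and refusing to overwrite existing files.
    targets = butler.retrieveArtifacts(
        refs,
        destination="./exported_files",
        transfer="copy",
        preserve_path=True,
        overwrite=False,
    )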

1488 

1489 def datasetExists( 

1490 self, 

1491 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1492 dataId: Optional[DataId] = None, 

1493 *, 

1494 collections: Any = None, 

1495 **kwargs: Any, 

1496 ) -> bool: 

1497 """Return True if the Dataset is actually present in the Datastore. 

1498 

1499 Parameters 

1500 ---------- 

1501 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1502 When a `DatasetRef` is provided, ``dataId`` should be `None`.

1503 Otherwise the `DatasetType` or name thereof. 

1504 dataId : `dict` or `DataCoordinate` 

1505 A `dict` of `Dimension` link name, value pairs that label the 

1506 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1507 should be provided as the first argument. 

1508 collections : Any, optional 

1509 Collections to be searched, overriding ``self.collections``. 

1510 Can be any of the types supported by the ``collections`` argument 

1511 to butler construction. 

1512 **kwargs 

1513 Additional keyword arguments used to augment or construct a 

1514 `DataCoordinate`. See `DataCoordinate.standardize` 

1515 parameters. 

1516 

1517 Raises 

1518 ------ 

1519 LookupError 

1520 Raised if the dataset is not even present in the Registry. 

1521 ValueError 

1522 Raised if a resolved `DatasetRef` was passed as an input, but it 

1523 differs from the one found in the registry. 

1524 TypeError 

1525 Raised if no collections were provided. 

1526 """ 

1527 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1528 return self.datastore.exists(ref) 
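Because a dataset unknown to the registry raises `LookupError` rather than returning `False`, callers often wrap the check above as in this sketch (data ID values are hypothetical)::

    try:
        stored = butler.datasetExists(
            "calexp", instrument="HSC", visit=903334, detector=16
        )
    except LookupError:
        # Not even registered in the registry, so certainly not stored.
        stored = False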

1529 

1530 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1531 """Remove one or more `~CollectionType.RUN` collections and the 

1532 datasets within them. 

1533 

1534 Parameters 

1535 ---------- 

1536 names : `Iterable` [ `str` ] 

1537 The names of the collections to remove. 

1538 unstore : `bool`, optional 

1539 If `True` (default), delete datasets from all datastores in which 

1540 they are present, and attempt to roll back the registry deletions if

1541 datastore deletions fail (which may not always be possible). If 

1542 `False`, datastore records for these datasets are still removed, 

1543 but any artifacts (e.g. files) will not be. 

1544 

1545 Raises 

1546 ------ 

1547 TypeError 

1548 Raised if one or more collections are not of type 

1549 `~CollectionType.RUN`. 

1550 """ 

1551 if not self.isWriteable(): 

1552 raise TypeError("Butler is read-only.") 

1553 names = list(names) 

1554 refs: List[DatasetRef] = [] 

1555 for name in names: 

1556 collectionType = self.registry.getCollectionType(name) 

1557 if collectionType is not CollectionType.RUN: 

1558 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1559 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1560 with self.registry.transaction(): 

1561 if unstore: 

1562 self.datastore.trash(refs) 

1563 else: 

1564 self.datastore.forget(refs) 

1565 for name in names: 

1566 self.registry.removeCollection(name) 

1567 if unstore: 

1568 # Point of no return for removing artifacts 

1569 self.datastore.emptyTrash() 
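A minimal sketch, assuming a writeable butler and a hypothetical scratch run name::

    from lsst.daf.butler import Butler

    butler = Butler("/repo", writeable=True)

    # Delete the run collection and remove its artifacts from every
    # datastore in which they are present.
    butler.removeRuns(["u/someone/scratch-run"], unstore=True)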

1570 

1571 def pruneCollection( 

1572 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1573 ) -> None: 

1574 """Remove a collection and possibly prune datasets within it. 

1575 

1576 Parameters 

1577 ---------- 

1578 name : `str` 

1579 Name of the collection to remove. If this is a 

1580 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1581 datasets within the collection are not modified unless ``unstore`` 

1582 is `True`. If this is a `~CollectionType.RUN` collection, 

1583 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1584 are fully removed from the data repository. 

1585 purge : `bool`, optional 

1586 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1587 fully removing datasets within them. Requires ``unstore=True`` as 

1588 well as an added precaution against accidental deletion. Must be 

1589 `False` (default) if the collection is not a ``RUN``. 

1590 unstore : `bool`, optional

1591 If `True`, remove all datasets in the collection from all 

1592 datastores in which they appear. 

1593 unlink : `list` [`str`], optional

1594 Before removing the collection ``name``, unlink it from these

1595 parent collections.

1596 

1597 Raises 

1598 ------ 

1599 TypeError 

1600 Raised if the butler is read-only or arguments are mutually 

1601 inconsistent. 

1602 """ 

1603 # See pruneDatasets comments for more information about the logic here; 

1604 # the cases are almost the same, but here we can rely on Registry to 

1605 # take care of everything but Datastore deletion when we remove the

1606 # collection. 

1607 if not self.isWriteable(): 

1608 raise TypeError("Butler is read-only.") 

1609 collectionType = self.registry.getCollectionType(name) 

1610 if purge and not unstore: 

1611 raise PurgeWithoutUnstorePruneCollectionsError() 

1612 if collectionType is CollectionType.RUN and not purge: 

1613 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1614 if collectionType is not CollectionType.RUN and purge: 

1615 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1616 

1617 def remove(child: str, parent: str) -> None: 

1618 """Remove a child collection from a parent collection.""" 

1619 # Remove child from parent. 

1620 chain = list(self.registry.getCollectionChain(parent)) 

1621 try: 

1622 chain.remove(child)

1623 except ValueError as e: 

1624 raise RuntimeError(f"{child} is not a child of {parent}") from e

1625 self.registry.setCollectionChain(parent, chain) 

1626 

1627 with self.registry.transaction(): 

1628 if unlink: 

1629 for parent in unlink: 

1630 remove(name, parent) 

1631 if unstore: 

1632 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1633 self.datastore.trash(refs) 

1634 self.registry.removeCollection(name) 

1635 

1636 if unstore: 

1637 # Point of no return for removing artifacts 

1638 self.datastore.emptyTrash() 
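The argument checks above translate into two call patterns, sketched here with hypothetical collection names::

    # TAGGED or CHAINED collection: only the collection itself is removed;
    # pass unstore=True as well to also trash the datasets' artifacts.
    butler.pruneCollection("u/someone/tagged-selection")

    # RUN collection: purge=True and unstore=True are both required, and
    # every dataset in the run is removed from the repository.
    butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)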

1639 

1640 def pruneDatasets( 

1641 self, 

1642 refs: Iterable[DatasetRef], 

1643 *, 

1644 disassociate: bool = True, 

1645 unstore: bool = False, 

1646 tags: Iterable[str] = (), 

1647 purge: bool = False, 

1648 run: Optional[str] = None, 

1649 ) -> None: 

1650 """Remove one or more datasets from a collection and/or storage. 

1651 

1652 Parameters 

1653 ---------- 

1654 refs : `~collections.abc.Iterable` of `DatasetRef` 

1655 Datasets to prune. These must be "resolved" references (not just 

1656 a `DatasetType` and data ID). 

1657 disassociate : `bool`, optional 

1658 Disassociate pruned datasets from ``tags``, or from all collections 

1659 if ``purge=True``. 

1660 unstore : `bool`, optional 

1661 If `True` (`False` is default) remove these datasets from all 

1662 datastores known to this butler. Note that this will make it 

1663 impossible to retrieve these datasets even via other collections. 

1664 Datasets that are already not stored are ignored by this option. 

1665 tags : `Iterable` [ `str` ], optional 

1666 `~CollectionType.TAGGED` collections to disassociate the datasets 

1667 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1668 `True`. 

1669 purge : `bool`, optional 

1670 If `True` (`False` is default), completely remove the dataset from 

1671 the `Registry`. To prevent accidental deletions, ``purge`` may 

1672 only be `True` if all of the following conditions are met: 

1673 

1674 - All given datasets are in the given run. 

1675 - ``disassociate`` is `True`.

1676 - ``unstore`` is `True`. 

1677 

1678 This mode may remove provenance information from datasets other 

1679 than those provided, and should be used with extreme care. 

1680 

1681 Raises 

1682 ------ 

1683 TypeError 

1684 Raised if the butler is read-only, if no collection was provided, 

1685 or the conditions for ``purge=True`` were not met. 

1686 """ 

1687 if not self.isWriteable(): 

1688 raise TypeError("Butler is read-only.") 

1689 if purge: 

1690 if not disassociate: 

1691 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1692 if not unstore: 

1693 raise TypeError("Cannot pass purge=True without unstore=True.") 

1694 elif disassociate: 

1695 tags = tuple(tags) 

1696 if not tags: 

1697 raise TypeError("No tags provided but disassociate=True.") 

1698 for tag in tags: 

1699 collectionType = self.registry.getCollectionType(tag) 

1700 if collectionType is not CollectionType.TAGGED: 

1701 raise TypeError( 

1702 f"Cannot disassociate from collection '{tag}' " 

1703 f"of non-TAGGED type {collectionType.name}." 

1704 ) 

1705 # Transform possibly-single-pass iterable into something we can iterate 

1706 # over multiple times. 

1707 refs = list(refs) 

1708 # Pruning a component of a DatasetRef makes no sense since registry 

1709 # doesn't know about components and datastore might not store 

1710 # components in a separate file 

1711 for ref in refs: 

1712 if ref.datasetType.component(): 

1713 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1714 # We don't need an unreliable Datastore transaction for this, because 

1715 # we've been extra careful to ensure that Datastore.trash only involves 

1716 # mutating the Registry (it can _look_ at Datastore-specific things, 

1717 # but shouldn't change them), and hence all operations here are 

1718 # Registry operations. 

1719 with self.registry.transaction(): 

1720 if unstore: 

1721 self.datastore.trash(refs) 

1722 if purge: 

1723 self.registry.removeDatasets(refs) 

1724 elif disassociate: 

1725 assert tags, "Guaranteed by earlier logic in this function." 

1726 for tag in tags: 

1727 self.registry.disassociate(tag, refs) 

1728 # We've exited the Registry transaction, and apparently committed. 

1729 # (if there was an exception, everything rolled back, and it's as if 

1730 # nothing happened - and we never get here). 

1731 # Datastore artifacts are not yet gone, but they're clearly marked 

1732 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1733 # problems we can try again later, and if manual administrative 

1734 # intervention is required, it's pretty clear what that should entail: 

1735 # deleting everything on disk and in private Datastore tables that is 

1736 # in the dataset_location_trash table. 

1737 if unstore: 

1738 # Point of no return for removing artifacts 

1739 self.datastore.emptyTrash() 
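A sketch of the two supported modes, driven by a registry query (dataset type and collection names are hypothetical)::

    refs = list(
        butler.registry.queryDatasets("calexp", collections="u/someone/old-run")
    )

    # Fully delete the datasets; purge=True requires disassociate=True
    # (the default) and unstore=True.
    butler.pruneDatasets(refs, purge=True, unstore=True)

    # Or merely detach them from a TAGGED collection, leaving the stored
    # artifacts and registry entries intact.
    butler.pruneDatasets(refs, disassociate=True, tags=["u/someone/tagged-selection"])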

1740 

1741 @transactional 

1742 def ingest( 

1743 self, 

1744 *datasets: FileDataset, 

1745 transfer: Optional[str] = "auto", 

1746 run: Optional[str] = None, 

1747 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1748 record_validation_info: bool = True, 

1749 ) -> None: 

1750 """Store and register one or more datasets that already exist on disk. 

1751 

1752 Parameters 

1753 ---------- 

1754 datasets : `FileDataset` 

1755 Each positional argument is a struct containing information about 

1756 a file to be ingested, including its URI (either absolute or 

1757 relative to the datastore root, if applicable), a `DatasetRef`, 

1758 and optionally a formatter class or its fully-qualified string 

1759 name. If a formatter is not provided, the formatter that would be 

1760 used for `put` is assumed. On successful return, all 

1761 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1762 attribute populated and all `FileDataset.formatter` attributes will 

1763 be set to the formatter class used. `FileDataset.path` attributes 

1764 may be modified to put paths in whatever the datastore considers a 

1765 standardized form. 

1766 transfer : `str`, optional 

1767 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1768 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1769 transfer the file. 

1770 run : `str`, optional 

1771 The name of the run ingested datasets should be added to, 

1772 overriding ``self.run``. 

1773 idGenerationMode : `DatasetIdGenEnum`, optional 

1774 Specifies option for generating dataset IDs. By default unique IDs 

1775 are generated for each inserted dataset. 

1776 record_validation_info : `bool`, optional 

1777 If `True`, the default, the datastore can record validation 

1778 information associated with the file. If `False` the datastore 

1779 will not attempt to track any information such as checksums 

1780 or file sizes. This can be useful if such information is tracked 

1781 in an external system or if the file is to be compressed in place. 

1782 It is up to the datastore whether this parameter is relevant. 

1783 

1784 Raises 

1785 ------ 

1786 TypeError 

1787 Raised if the butler is read-only or if no run was provided. 

1788 NotImplementedError 

1789 Raised if the `Datastore` does not support the given transfer mode. 

1790 DatasetTypeNotSupportedError 

1791 Raised if one or more files to be ingested have a dataset type that 

1792 is not supported by the `Datastore`.

1793 FileNotFoundError 

1794 Raised if one of the given files does not exist. 

1795 FileExistsError 

1796 Raised if transfer is not `None` but the (internal) location the 

1797 file would be moved to is already occupied. 

1798 

1799 Notes 

1800 ----- 

1801 This operation is not fully exception safe: if a database operation 

1802 fails, the given `FileDataset` instances may be only partially updated. 

1803 

1804 It is atomic in terms of database operations (they will either all 

1805 succeed or all fail) providing the database engine implements 

1806 transactions correctly. It will attempt to be atomic in terms of 

1807 filesystem operations as well, but this cannot be implemented 

1808 rigorously for most datastores. 

1809 """ 

1810 if not self.isWriteable(): 

1811 raise TypeError("Butler is read-only.") 

1812 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1813 # Reorganize the inputs so they're grouped by DatasetType and then 

1814 # data ID. We also include a list of DatasetRefs for each FileDataset 

1815 # to hold the resolved DatasetRefs returned by the Registry, before 

1816 # it's safe to swap them into FileDataset.refs. 

1817 # Some type annotation aliases to make that clearer: 

1818 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1819 GroupedData = MutableMapping[DatasetType, GroupForType] 

1820 # The actual data structure: 

1821 groupedData: GroupedData = defaultdict(dict) 

1822 # And the nested loop that populates it: 

1823 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1824 # This list intentionally shared across the inner loop, since it's 

1825 # associated with `dataset`. 

1826 resolvedRefs: List[DatasetRef] = [] 

1827 

1828 # Somewhere to store pre-existing refs if we have an 

1829 # execution butler. 

1830 existingRefs: List[DatasetRef] = [] 

1831 

1832 for ref in dataset.refs: 

1833 if ref.dataId in groupedData[ref.datasetType]: 

1834 raise ConflictingDefinitionError( 

1835 f"Ingest conflict. Dataset {dataset.path} has same" 

1836 " DataId as other ingest dataset" 

1837 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1838 f" ({ref.dataId})" 

1839 ) 

1840 if self._allow_put_of_predefined_dataset: 

1841 existing_ref = self.registry.findDataset( 

1842 ref.datasetType, dataId=ref.dataId, collections=run 

1843 ) 

1844 if existing_ref: 

1845 if self.datastore.knows(existing_ref): 

1846 raise ConflictingDefinitionError( 

1847 f"Dataset associated with path {dataset.path}" 

1848 f" already exists as {existing_ref}." 

1849 ) 

1850 # Store this ref elsewhere since it already exists 

1851 # and we do not want to remake it but we do want 

1852 # to store it in the datastore. 

1853 existingRefs.append(existing_ref) 

1854 

1855 # Nothing else to do until we have finished 

1856 # iterating. 

1857 continue 

1858 

1859 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1860 

1861 if existingRefs: 

1862 

1863 if len(dataset.refs) != len(existingRefs): 

1864 # Keeping track of partially pre-existing datasets is hard 

1865 # and should generally never happen. For now don't allow 

1866 # it. 

1867 raise ConflictingDefinitionError( 

1868 f"For dataset {dataset.path} some dataIds already exist" 

1869 " in registry but others do not. This is not supported." 

1870 ) 

1871 

1872 # Attach the resolved refs if we found them. 

1873 dataset.refs = existingRefs 

1874 

1875 # Now we can bulk-insert into Registry for each DatasetType. 

1876 for datasetType, groupForType in progress.iter_item_chunks( 

1877 groupedData.items(), desc="Bulk-inserting datasets by type" 

1878 ): 

1879 refs = self.registry.insertDatasets( 

1880 datasetType, 

1881 dataIds=groupForType.keys(), 

1882 run=run, 

1883 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1884 idGenerationMode=idGenerationMode, 

1885 ) 

1886 # Append those resolved DatasetRefs to the new lists we set up for 

1887 # them. 

1888 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1889 resolvedRefs.append(ref) 

1890 

1891 # Go back to the original FileDatasets to replace their refs with the 

1892 # new resolved ones. 

1893 for groupForType in progress.iter_chunks( 

1894 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1895 ): 

1896 for dataset, resolvedRefs in groupForType.values(): 

1897 dataset.refs = resolvedRefs 

1898 

1899 # Bulk-insert everything into Datastore. 

1900 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 
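A sketch of ingesting one existing file. The dataset type ``raw``, the data ID values, the file path, and the run name are hypothetical, and the run collection is assumed to have been registered already::

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(
        datasetType, {"instrument": "HSC", "exposure": 903334, "detector": 16}
    )

    # Symlink the file into the datastore and register it in the given run.
    butler.ingest(
        FileDataset(path="/data/raw/HSC-903334-16.fits", refs=[ref]),
        transfer="symlink",
        run="HSC/raw/all",
    )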

1901 

1902 @contextlib.contextmanager 

1903 def export( 

1904 self, 

1905 *, 

1906 directory: Optional[str] = None, 

1907 filename: Optional[str] = None, 

1908 format: Optional[str] = None, 

1909 transfer: Optional[str] = None, 

1910 ) -> Iterator[RepoExportContext]: 

1911 """Export datasets from the repository represented by this `Butler`. 

1912 

1913 This method is a context manager that returns a helper object 

1914 (`RepoExportContext`) that is used to indicate what information from 

1915 the repository should be exported. 

1916 

1917 Parameters 

1918 ---------- 

1919 directory : `str`, optional 

1920 Directory dataset files should be written to if ``transfer`` is not 

1921 `None`. 

1922 filename : `str`, optional 

1923 Name for the file that will include database information associated 

1924 with the exported datasets. If this is not an absolute path and 

1925 ``directory`` is not `None`, it will be written to ``directory`` 

1926 instead of the current working directory. Defaults to 

1927 "export.{format}". 

1928 format : `str`, optional 

1929 File format for the database information file. If `None`, the 

1930 extension of ``filename`` will be used. 

1931 transfer : `str`, optional 

1932 Transfer mode passed to `Datastore.export`. 

1933 

1934 Raises 

1935 ------ 

1936 TypeError 

1937 Raised if the set of arguments passed is inconsistent. 

1938 

1939 Examples 

1940 -------- 

1941 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1942 methods are used to provide the iterables over data IDs and/or datasets 

1943 to be exported:: 

1944 

1945 with butler.export(filename="exports.yaml") as export:

1946 # Export all flats, but none of the dimension element rows 

1947 # (i.e. data ID information) associated with them. 

1948 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1949 elements=()) 

1950 # Export all datasets that start with "deepCoadd_" and all of 

1951 # their associated data ID information. 

1952 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1953 """ 

1954 if directory is None and transfer is not None: 

1955 raise TypeError("Cannot transfer without providing a directory.") 

1956 if transfer == "move": 

1957 raise TypeError("Transfer may not be 'move': export is read-only") 

1958 if format is None: 

1959 if filename is None: 

1960 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1961 else: 

1962 _, format = os.path.splitext(filename) 

1963 elif filename is None: 

1964 filename = f"export.{format}" 

1965 if directory is not None: 

1966 filename = os.path.join(directory, filename) 

1967 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

1968 with open(filename, "w") as stream: 

1969 backend = BackendClass(stream) 

1970 try: 

1971 helper = RepoExportContext( 

1972 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

1973 ) 

1974 yield helper 

1975 except BaseException: 

1976 raise 

1977 else: 

1978 helper._finish() 

1979 

1980 def import_( 

1981 self, 

1982 *, 

1983 directory: Optional[str] = None, 

1984 filename: Union[str, TextIO, None] = None, 

1985 format: Optional[str] = None, 

1986 transfer: Optional[str] = None, 

1987 skip_dimensions: Optional[Set] = None, 

1988 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1989 reuseIds: bool = False, 

1990 ) -> None: 

1991 """Import datasets into this repository that were exported from a 

1992 different butler repository via `~lsst.daf.butler.Butler.export`. 

1993 

1994 Parameters 

1995 ---------- 

1996 directory : `str`, optional 

1997 Directory containing dataset files to import from. If `None`, 

1998 ``filename`` and all dataset file paths specified therein must 

1999 be absolute. 

2000 filename : `str` or `TextIO`, optional 

2001 A stream or name of file that contains database information 

2002 associated with the exported datasets, typically generated by 

2003 `~lsst.daf.butler.Butler.export`. If this is a string (name) and

2004 is not an absolute path, does not exist in the current working 

2005 directory, and ``directory`` is not `None`, it is assumed to be in 

2006 ``directory``. Defaults to "export.{format}". 

2007 format : `str`, optional 

2008 File format for ``filename``. If `None`, the extension of 

2009 ``filename`` will be used. 

2010 transfer : `str`, optional 

2011 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2012 skip_dimensions : `set`, optional 

2013 Names of dimensions that should be skipped and not imported. 

2014 idGenerationMode : `DatasetIdGenEnum`, optional 

2015 Specifies option for generating dataset IDs when IDs are not 

2016 provided or their type does not match backend type. By default 

2017 unique IDs are generated for each inserted dataset. 

2018 reuseIds : `bool`, optional 

2019 If `True` then forces re-use of imported dataset IDs for integer 

2020 IDs which are normally generated as auto-incremented; exception 

2021 will be raised if imported IDs clash with existing ones. This 

2022 option has no effect on the use of globally-unique IDs which are 

2023 always re-used (or generated if integer IDs are being imported). 

2024 

2025 Raises 

2026 ------ 

2027 TypeError 

2028 Raised if the set of arguments passed is inconsistent, or if the 

2029 butler is read-only. 

2030 """ 

2031 if not self.isWriteable(): 

2032 raise TypeError("Butler is read-only.") 

2033 if format is None: 

2034 if filename is None: 

2035 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2036 else: 

2037 _, format = os.path.splitext(filename) # type: ignore 

2038 elif filename is None: 

2039 filename = f"export.{format}" 

2040 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2041 filename = os.path.join(directory, filename) 

2042 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2043 

2044 def doImport(importStream: TextIO) -> None: 

2045 backend = BackendClass(importStream, self.registry) 

2046 backend.register() 

2047 with self.transaction(): 

2048 backend.load( 

2049 self.datastore, 

2050 directory=directory, 

2051 transfer=transfer, 

2052 skip_dimensions=skip_dimensions, 

2053 idGenerationMode=idGenerationMode, 

2054 reuseIds=reuseIds, 

2055 ) 

2056 

2057 if isinstance(filename, str): 

2058 with open(filename, "r") as stream: 

2059 doImport(stream) 

2060 else: 

2061 doImport(filename) 
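A sketch of the round trip with `export` above; paths, dataset type, and transfer modes are hypothetical, and ``source_butler`` / ``dest_butler`` are assumed to be existing `Butler` instances::

    # In the source repository: write export.yaml plus copies of the files.
    with source_butler.export(
        directory="/tmp/export", filename="export.yaml", transfer="copy"
    ) as export:
        export.saveDatasets(source_butler.registry.queryDatasets("flat"))

    # In the destination repository: read the YAML back and ingest the files.
    dest_butler.import_(
        directory="/tmp/export", filename="export.yaml", transfer="symlink"
    )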

2062 

2063 def transfer_from( 

2064 self, 

2065 source_butler: Butler, 

2066 source_refs: Iterable[DatasetRef], 

2067 transfer: str = "auto", 

2068 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None,

2069 skip_missing: bool = True, 

2070 register_dataset_types: bool = False, 

2071 ) -> List[DatasetRef]: 

2072 """Transfer datasets to this Butler from a run in another Butler. 

2073 

2074 Parameters 

2075 ---------- 

2076 source_butler : `Butler` 

2077 Butler from which the datasets are to be transferred. 

2078 source_refs : iterable of `DatasetRef` 

2079 Datasets defined in the source butler that should be transferred to 

2080 this butler. 

2081 transfer : `str`, optional 

2082 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2083 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2084 A mapping of dataset type to ID generation mode. Only used if 

2085 the source butler is using integer IDs. Should not be used 

2086 if this receiving butler uses integer IDs. Without this mapping,

2087 dataset import always uses `DatasetIdGenEnum.UNIQUE`.

2088 skip_missing : `bool` 

2089 If `True`, datasets with no datastore artifact associated with 

2090 them are not transferred. If `False` a registry entry will be 

2091 created even if no datastore record is created (and so will 

2092 look equivalent to the dataset being unstored). 

2093 register_dataset_types : `bool` 

2094 If `True` any missing dataset types are registered. Otherwise 

2095 an exception is raised. 

2096 

2097 Returns 

2098 ------- 

2099 refs : `list` of `DatasetRef` 

2100 The refs added to this Butler. 

2101 

2102 Notes 

2103 ----- 

2104 Requires that any dimension definitions are already present in the 

2105 receiving Butler. The datastore artifact has to exist for a transfer 

2106 to be made but non-existence is not an error. 

2107 

2108 Datasets that already exist in this run will be skipped. 

2109 

2110 The datasets are imported as part of a transaction, although 

2111 dataset types are registered before the transaction is started. 

2112 This means that it is possible for a dataset type to be registered 

2113 even though transfer has failed. 

2114 """ 

2115 if not self.isWriteable(): 

2116 raise TypeError("Butler is read-only.") 

2117 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2118 

2119 # Will iterate through the refs multiple times so need to convert 

2120 # to a list if this isn't a collection. 

2121 if not isinstance(source_refs, collections.abc.Collection): 

2122 source_refs = list(source_refs) 

2123 

2124 original_count = len(source_refs) 

2125 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2126 

2127 if id_gen_map is None: 

2128 id_gen_map = {} 

2129 

2130 # In some situations the datastore artifact may be missing 

2131 # and we do not want that registry entry to be imported. 

2132 # Asking datastore is not sufficient, the records may have been 

2133 # purged, we have to ask for the (predicted) URI and check 

2134 # existence explicitly. Execution butler is set up exactly like 

2135 # this with no datastore records. 

2136 artifact_existence: Dict[ResourcePath, bool] = {} 

2137 if skip_missing: 

2138 dataset_existence = source_butler.datastore.mexists( 

2139 source_refs, artifact_existence=artifact_existence 

2140 ) 

2141 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2142 filtered_count = len(source_refs) 

2143 log.verbose( 

2144 "%d datasets removed because the artifact does not exist. Now have %d.", 

2145 original_count - filtered_count, 

2146 filtered_count, 

2147 ) 

2148 

2149 # Importing requires that we group the refs by dataset type and run 

2150 # before doing the import. 

2151 source_dataset_types = set() 

2152 grouped_refs = defaultdict(list) 

2153 grouped_indices = defaultdict(list) 

2154 for i, ref in enumerate(source_refs): 

2155 grouped_refs[ref.datasetType, ref.run].append(ref) 

2156 grouped_indices[ref.datasetType, ref.run].append(i) 

2157 source_dataset_types.add(ref.datasetType) 

2158 

2159 # Check to see if the dataset type in the source butler has 

2160 # the same definition in the target butler and register missing 

2161 # ones if requested. Registration must happen outside a transaction. 

2162 newly_registered_dataset_types = set() 

2163 for datasetType in source_dataset_types: 

2164 if register_dataset_types: 

2165 # Let this raise immediately if inconsistent. Continuing 

2166 # on to find additional inconsistent dataset types 

2167 # might result in additional unwanted dataset types being 

2168 # registered. 

2169 if self.registry.registerDatasetType(datasetType): 

2170 newly_registered_dataset_types.add(datasetType) 

2171 else: 

2172 # If the dataset type is missing, let it fail immediately. 

2173 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2174 if target_dataset_type != datasetType: 

2175 raise ConflictingDefinitionError( 

2176 "Source butler dataset type differs from definition" 

2177 f" in target butler: {datasetType} !=" 

2178 f" {target_dataset_type}" 

2179 ) 

2180 if newly_registered_dataset_types: 

2181 # We may have registered some even if there were inconsistencies 

2182 # but should let people know (or else remove them again). 

2183 log.log( 

2184 VERBOSE, 

2185 "Registered the following dataset types in the target Butler: %s", 

2186 ", ".join(d.name for d in newly_registered_dataset_types), 

2187 ) 

2188 else: 

2189 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2190 

2191 # The returned refs should be identical for UUIDs. 

2192 # For now must also support integers and so need to retain the 

2193 # newly-created refs from this registry. 

2194 # Pre-size it so we can assign refs into the correct slots 

2195 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2196 default_id_gen = DatasetIdGenEnum.UNIQUE 

2197 

2198 handled_collections: Set[str] = set() 

2199 

2200 # Do all the importing in a single transaction. 

2201 with self.transaction(): 

2202 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2203 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2204 ): 

2205 if run not in handled_collections: 

2206 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2207 registered = self.registry.registerRun(run, doc=run_doc) 

2208 handled_collections.add(run) 

2209 if registered: 

2210 log.log(VERBOSE, "Creating output run %s", run) 

2211 

2212 id_generation_mode = default_id_gen 

2213 if isinstance(refs_to_import[0].id, int): 

2214 # ID generation mode might need to be overridden when 

2215 # targeting UUID

2216 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2217 

2218 n_refs = len(refs_to_import) 

2219 log.verbose( 

2220 "Importing %d ref%s of dataset type %s into run %s", 

2221 n_refs, 

2222 "" if n_refs == 1 else "s", 

2223 datasetType.name, 

2224 run, 

2225 ) 

2226 

2227 # No way to know if this butler's registry uses UUID. 

2228 # We have to trust the caller on this. If it fails they will 

2229 # have to change their approach. We can't catch the exception 

2230 # and retry with unique because that will mess up the 

2231 # transaction handling. We aren't allowed to ask the registry 

2232 # manager what type of ID it is using. 

2233 imported_refs = self.registry._importDatasets( 

2234 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2235 ) 

2236 

2237 # Map them into the correct slots to match the initial order 

2238 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2239 transferred_refs_tmp[i] = ref 

2240 

2241 # Mypy insists that we might have None in here so we have to make 

2242 # that explicit by assigning to a new variable and filtering out 

2243 # something that won't be there. 

2244 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2245 

2246 # Check consistency 

2247 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2248 

2249 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2250 

2251 # The transferred refs need to be reordered to match the original 

2252 # ordering given by the caller. Without this the datastore transfer 

2253 # will be broken. 

2254 

2255 # Ask the datastore to transfer. The datastore has to check that 

2256 # the source datastore is compatible with the target datastore. 

2257 self.datastore.transfer_from( 

2258 source_butler.datastore, 

2259 source_refs, 

2260 local_refs=transferred_refs, 

2261 transfer=transfer, 

2262 artifact_existence=artifact_existence, 

2263 ) 

2264 

2265 return transferred_refs 
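A sketch of a direct repository-to-repository transfer (paths, dataset type, and collection are hypothetical)::

    from lsst.daf.butler import Butler

    source = Butler("/repo/source")
    dest = Butler("/repo/dest", writeable=True)

    refs = source.registry.queryDatasets("calexp", collections="HSC/runs/example")

    # Register any missing dataset types and copy both registry entries
    # and datastore artifacts into the destination.
    transferred = dest.transfer_from(
        source, refs, transfer="copy", register_dataset_types=True
    )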

2266 

2267 def validateConfiguration( 

2268 self, 

2269 logFailures: bool = False, 

2270 datasetTypeNames: Optional[Iterable[str]] = None, 

2271 ignore: Optional[Iterable[str]] = None,

2272 ) -> None: 

2273 """Validate butler configuration. 

2274 

2275 Checks that each `DatasetType` can be stored in the `Datastore`. 

2276 

2277 Parameters 

2278 ---------- 

2279 logFailures : `bool`, optional 

2280 If `True`, output a log message for every validation error 

2281 detected. 

2282 datasetTypeNames : iterable of `str`, optional 

2283 The `DatasetType` names that should be checked. This allows 

2284 only a subset to be selected. 

2285 ignore : iterable of `str`, optional 

2286 Names of DatasetTypes to skip over. This can be used to skip 

2287 known problems. If a named `DatasetType` corresponds to a 

2288 composite, all components of that `DatasetType` will also be 

2289 ignored. 

2290 

2291 Raises 

2292 ------ 

2293 ButlerValidationError 

2294 Raised if there is some inconsistency with how this Butler 

2295 is configured. 

2296 """ 

2297 if datasetTypeNames: 

2298 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2299 else: 

2300 datasetTypes = list(self.registry.queryDatasetTypes()) 

2301 

2302 # filter out anything from the ignore list 

2303 if ignore: 

2304 ignore = set(ignore) 

2305 datasetTypes = [ 

2306 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2307 ] 

2308 else: 

2309 ignore = set() 

2310 

2311 # Find all the registered instruments 

2312 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2313 

2314 # For each datasetType that has an instrument dimension, create 

2315 # a DatasetRef for each defined instrument 

2316 datasetRefs = [] 

2317 

2318 for datasetType in datasetTypes: 

2319 if "instrument" in datasetType.dimensions: 

2320 for instrument in instruments: 

2321 datasetRef = DatasetRef( 

2322 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2323 ) 

2324 datasetRefs.append(datasetRef) 

2325 

2326 entities: List[Union[DatasetType, DatasetRef]] = [] 

2327 entities.extend(datasetTypes) 

2328 entities.extend(datasetRefs) 

2329 

2330 datastoreErrorStr = None 

2331 try: 

2332 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2333 except ValidationError as e: 

2334 datastoreErrorStr = str(e) 

2335 

2336 # Also check that the LookupKeys used by the datastores match 

2337 # registry and storage class definitions 

2338 keys = self.datastore.getLookupKeys() 

2339 

2340 failedNames = set() 

2341 failedDataId = set() 

2342 for key in keys: 

2343 if key.name is not None: 

2344 if key.name in ignore: 

2345 continue 

2346 

2347 # skip if specific datasetType names were requested and this 

2348 # name does not match 

2349 if datasetTypeNames and key.name not in datasetTypeNames: 

2350 continue 

2351 

2352 # See if it is a StorageClass or a DatasetType 

2353 if key.name in self.storageClasses: 

2354 pass 

2355 else: 

2356 try: 

2357 self.registry.getDatasetType(key.name) 

2358 except KeyError: 

2359 if logFailures: 

2360 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2361 failedNames.add(key) 

2362 else: 

2363 # Dimensions are checked for consistency when the Butler 

2364 # is created and rendezvoused with a universe. 

2365 pass 

2366 

2367 # Check that the instrument is a valid instrument 

2368 # Currently only 'instrument' is supported, so check for that

2369 if key.dataId: 

2370 dataIdKeys = set(key.dataId) 

2371 if dataIdKeys != {"instrument"}:

2372 if logFailures: 

2373 log.critical("Key '%s' has unsupported DataId override", key) 

2374 failedDataId.add(key) 

2375 elif key.dataId["instrument"] not in instruments: 

2376 if logFailures: 

2377 log.critical("Key '%s' has unknown instrument", key) 

2378 failedDataId.add(key) 

2379 

2380 messages = [] 

2381 

2382 if datastoreErrorStr: 

2383 messages.append(datastoreErrorStr) 

2384 

2385 for failed, msg in ( 

2386 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2387 (failedDataId, "Keys with bad DataId entries: "), 

2388 ): 

2389 if failed: 

2390 msg += ", ".join(str(k) for k in failed) 

2391 messages.append(msg) 

2392 

2393 if messages: 

2394 raise ValidationError(";\n".join(messages)) 
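A sketch of running the validation above while logging each problem, with a hypothetical dataset type excluded from the check::

    from lsst.daf.butler import ValidationError

    try:
        butler.validateConfiguration(logFailures=True, ignore=["raw"])
    except ValidationError as err:
        print(f"Butler configuration problems:\n{err}")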

2395 

2396 @property 

2397 def collections(self) -> CollectionSearch: 

2398 """The collections to search by default, in order (`CollectionSearch`). 

2399 

2400 This is an alias for ``self.registry.defaults.collections``. It cannot 

2401 be set directly in isolation, but all defaults may be changed together 

2402 by assigning a new `RegistryDefaults` instance to 

2403 ``self.registry.defaults``. 

2404 """ 

2405 return self.registry.defaults.collections 

2406 

2407 @property 

2408 def run(self) -> Optional[str]: 

2409 """Name of the run this butler writes outputs to by default (`str` or 

2410 `None`). 

2411 

2412 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2413 directly in isolation, but all defaults may be changed together by 

2414 assigning a new `RegistryDefaults` instance to 

2415 ``self.registry.defaults``. 

2416 """ 

2417 return self.registry.defaults.run 
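The two properties above are read-only views; as their docstrings note, defaults are replaced as a unit. A sketch with hypothetical names, assuming the named collections and run already exist in the repository::

    from lsst.daf.butler.registry import RegistryDefaults

    # Swap in a new set of defaults for subsequent operations.
    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/someone/new-run"
    )

    assert butler.run == "u/someone/new-run"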

2418 

2419 registry: Registry 

2420 """The object that manages dataset metadata and relationships (`Registry`). 

2421 

2422 Most operations that don't involve reading or writing butler datasets are 

2423 accessible only via `Registry` methods. 

2424 """ 

2425 

2426 datastore: Datastore 

2427 """The object that manages actual dataset storage (`Datastore`). 

2428 

2429 Direct user access to the datastore should rarely be necessary; the primary 

2430 exception is the case where a `Datastore` implementation provides extra 

2431 functionality beyond what the base class defines. 

2432 """ 

2433 

2434 storageClasses: StorageClassFactory 

2435 """An object that maps known storage class names to objects that fully 

2436 describe them (`StorageClassFactory`). 

2437 """ 

2438 

2439 _allow_put_of_predefined_dataset: bool 

2440 """Allow a put to succeed even if there is already a registry entry for it 

2441 but not a datastore record. (`bool`)."""