Coverage for python/lsst/daf/butler/_butler.py: 9%

687 statements  

coverage.py v6.4.4, created at 2022-09-30 02:19 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_class_of 

62from lsst.utils.logging import VERBOSE, getLogger 

63 

64from ._butlerConfig import ButlerConfig 

65from ._butlerRepoIndex import ButlerRepoIndex 

66from ._deferredDatasetHandle import DeferredDatasetHandle 

67from ._limited_butler import LimitedButler 

68from .core import ( 

69 AmbiguousDatasetError, 

70 Config, 

71 ConfigSubset, 

72 DataCoordinate, 

73 DataId, 

74 DataIdValue, 

75 DatasetRef, 

76 DatasetRefURIs, 

77 DatasetType, 

78 Datastore, 

79 Dimension, 

80 DimensionConfig, 

81 DimensionElement, 

82 DimensionRecord, 

83 DimensionUniverse, 

84 FileDataset, 

85 Progress, 

86 StorageClassFactory, 

87 Timespan, 

88 ValidationError, 

89) 

90from .core.repoRelocation import BUTLER_ROOT_TAG 

91from .core.utils import transactional 

92from .registry import ( 

93 CollectionSearch, 

94 CollectionType, 

95 ConflictingDefinitionError, 

96 DataIdError, 

97 DatasetIdGenEnum, 

98 Registry, 

99 RegistryConfig, 

100 RegistryDefaults, 

101) 

102from .transfers import RepoExportContext 

103 

104log = getLogger(__name__) 

105 

106 

107class ButlerValidationError(ValidationError): 

108 """There is a problem with the Butler configuration.""" 

109 

110 pass 

111 

112 

113class PruneCollectionsArgsError(TypeError): 

114 """Base class for errors relating to Butler.pruneCollections input 

115 arguments. 

116 """ 

117 

118 pass 

119 

120 

121class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

122 """Raised when purge and unstore are both required to be True, and 

123 purge is True but unstore is False. 

124 """ 

125 

126 def __init__(self) -> None: 

127 super().__init__("Cannot pass purge=True without unstore=True.") 

128 

129 

130class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

131 """Raised when pruning a RUN collection but purge is False.""" 

132 

133 def __init__(self, collectionType: CollectionType): 

134 self.collectionType = collectionType 

135 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

136 

137 

138class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

139 """Raised when purge is True but is not supported for the given 

140 collection.""" 

141 

142 def __init__(self, collectionType: CollectionType): 

143 self.collectionType = collectionType 

144 super().__init__( 

145 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

146 ) 

147 

148 

149class Butler(LimitedButler): 

150 """Main entry point for the data access system. 

151 

152 Parameters 

153 ---------- 

154 config : `ButlerConfig`, `Config` or `str`, optional 

155 Configuration. Anything acceptable to the 

156 `ButlerConfig` constructor. If a directory path 

157 is given the configuration will be read from a ``butler.yaml`` file in 

158 that location. If `None` is given default values will be used. 

159 butler : `Butler`, optional 

160 If provided, construct a new Butler that uses the same registry and 

161 datastore as the given one, but with the given collection and run. 

162 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

163 arguments. 

164 collections : `str` or `Iterable` [ `str` ], optional 

165 An expression specifying the collections to be searched (in order) when 

166 reading datasets. 

167 This may be a `str` collection name or an iterable thereof. 

168 See :ref:`daf_butler_collection_expressions` for more information. 

169 These collections are not registered automatically and must be 

170 manually registered before they are used by any method, but they may be 

171 manually registered after the `Butler` is initialized. 

172 run : `str`, optional 

173 Name of the `~CollectionType.RUN` collection new datasets should be 

174 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

175 ``collections`` will be set to ``[run]``. If not `None`, this 

176 collection will automatically be registered. If this is not set (and 

177 ``writeable`` is not set either), a read-only butler will be created. 

178 searchPaths : `list` of `str`, optional 

179 Directory paths to search when calculating the full Butler 

180 configuration. Not used if the supplied config is already a 

181 `ButlerConfig`. 

182 writeable : `bool`, optional 

183 Explicitly sets whether the butler supports write operations. If not 

184 provided, a read-write butler is created when ``run`` is not `None` and 

185 a read-only butler otherwise. 

186 inferDefaults : `bool`, optional 

187 If `True` (default) infer default data ID values from the values 

188 present in the datasets in ``collections``: if all collections have the 

189 same value (or no value) for a governor dimension, that value will be 

190 the default for that dimension. Nonexistent collections are ignored. 

191 If a default value is provided explicitly for a governor dimension via 

192 ``**kwargs``, no default will be inferred for that dimension. 

193 **kwargs : `str` 

194 Default data ID key-value pairs. These may only identify "governor" 

195 dimensions like ``instrument`` and ``skymap``. 

196 

197 Examples 

198 -------- 

199 While there are many ways to control exactly how a `Butler` interacts with 

200 the collections in its `Registry`, the most common cases are still simple. 

201 

202 For a read-only `Butler` that searches one collection, do:: 

203 

204 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

205 

206 For a read-write `Butler` that writes to and reads from a 

207 `~CollectionType.RUN` collection:: 

208 

209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

210 

211 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

212 because we want to write to one `~CollectionType.RUN` collection but read 

213 from several others as well:: 

214 

215 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

216 collections=["u/alice/DM-50000/a", 

217 "u/bob/DM-49998", 

218 "HSC/defaults"]) 

219 

220 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

221 Datasets will be read first from that run (since it appears first in the 

222 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

223 

224 Finally, one can always create a `Butler` with no collections:: 

225 

226 butler = Butler("/path/to/repo", writeable=True) 

227 

228 This can be extremely useful when you just want to use ``butler.registry``, 

229 e.g. for inserting dimension data or managing collections, or when the 

230 collections you want to use with the butler are not consistent. 

231 Passing ``writeable`` explicitly here is only necessary if you want to be 

232 able to make changes to the repo - usually the value for ``writeable`` can 

233 be guessed from the collection arguments provided, but it defaults to 

234 `False` when there are no collection arguments. 

235 """ 

236 

237 def __init__( 

238 self, 

239 config: Union[Config, str, None] = None, 

240 *, 

241 butler: Optional[Butler] = None, 

242 collections: Any = None, 

243 run: Optional[str] = None, 

244 searchPaths: Optional[List[str]] = None, 

245 writeable: Optional[bool] = None, 

246 inferDefaults: bool = True, 

247 **kwargs: str, 

248 ): 

249 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

250 # Load registry, datastore, etc. from config or existing butler. 

251 if butler is not None: 

252 if config is not None or searchPaths is not None or writeable is not None: 

253 raise TypeError( 

254 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

255 ) 

256 self.registry = butler.registry.copy(defaults) 

257 self.datastore = butler.datastore 

258 self.storageClasses = butler.storageClasses 

259 self._config: ButlerConfig = butler._config 

260 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

261 else: 

262 # Can only look for strings in the known repos list. 

263 if isinstance(config, str) and config in self.get_known_repos(): 

264 config = str(self.get_repo_uri(config)) 

265 try: 

266 self._config = ButlerConfig(config, searchPaths=searchPaths) 

267 except FileNotFoundError as e: 

268 if known := self.get_known_repos(): 

269 aliases = f"(known aliases: {', '.join(known)})" 

270 else: 

271 aliases = "(no known aliases)" 

272 raise FileNotFoundError(f"{e} {aliases}") from e 

274 try: 

275 if "root" in self._config: 

276 butlerRoot = self._config["root"] 

277 else: 

278 butlerRoot = self._config.configDir 

279 if writeable is None: 

280 writeable = run is not None 

281 self.registry = Registry.fromConfig( 

282 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

283 ) 

284 self.datastore = Datastore.fromConfig( 

285 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

286 ) 

287 self.storageClasses = StorageClassFactory() 

288 self.storageClasses.addFromConfig(self._config) 

289 self._allow_put_of_predefined_dataset = self._config.get( 

290 "allow_put_of_predefined_dataset", False 

291 ) 

292 except Exception: 

293 # Failures here usually mean that configuration is incomplete, 

294 # just issue an error message which includes config file URI. 

295 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

296 raise 

297 

298 if "run" in self._config or "collection" in self._config: 

299 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

300 

301 GENERATION: ClassVar[int] = 3 

302 """This is a Generation 3 Butler. 

303 

304 This attribute may be removed in the future, once the Generation 2 Butler 

305 interface has been fully retired; it should only be used in transitional 

306 code. 

307 """ 

308 

309 @classmethod 

310 def get_repo_uri(cls, label: str) -> ResourcePath: 

311 """Look up the label in a butler repository index. 

312 

313 Parameters 

314 ---------- 

315 label : `str` 

316 Label of the Butler repository to look up. 

317 

318 Returns 

319 ------- 

320 uri : `lsst.resources.ResourcePath` 

321 URI to the Butler repository associated with the given label. 

322 

323 Raises 

324 ------ 

325 KeyError 

326 Raised if the label is not found in the index, or if an index 

327 can not be found at all. 

328 

329 Notes 

330 ----- 

331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

332 information is discovered. 

333 """ 

334 return ButlerRepoIndex.get_repo_uri(label) 

335 

336 @classmethod 

337 def get_known_repos(cls) -> Set[str]: 

338 """Retrieve the list of known repository labels. 

339 

340 Returns 

341 ------- 

342 repos : `set` of `str` 

343 All the known labels. Can be empty if no index can be found. 

344 

345 Notes 

346 ----- 

347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

348 information is discovered. 

349 """ 

350 return ButlerRepoIndex.get_known_repos() 

351 
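# Illustrative sketch (hypothetical label "main"): resolving a repository
# label from the index and constructing a Butler from it. The constructor
# also accepts the label directly when it appears in get_known_repos().
#
#     if "main" in Butler.get_known_repos():
#         uri = Butler.get_repo_uri("main")
#         butler = Butler("main")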

352 @staticmethod 

353 def makeRepo( 

354 root: ResourcePathExpression, 

355 config: Union[Config, str, None] = None, 

356 dimensionConfig: Union[Config, str, None] = None, 

357 standalone: bool = False, 

358 searchPaths: Optional[List[str]] = None, 

359 forceConfigRoot: bool = True, 

360 outfile: Optional[ResourcePathExpression] = None, 

361 overwrite: bool = False, 

362 ) -> Config: 

363 """Create an empty data repository by adding a butler.yaml config 

364 to a repository root directory. 

365 

366 Parameters 

367 ---------- 

368 root : `lsst.resources.ResourcePathExpression` 

369 Path or URI to the root location of the new repository. Will be 

370 created if it does not exist. 

371 config : `Config` or `str`, optional 

372 Configuration to write to the repository, after setting any 

373 root-dependent Registry or Datastore config options. Can not 

374 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

375 configuration will be used. Root-dependent config options 

376 specified in this config are overwritten if ``forceConfigRoot`` 

377 is `True`. 

378 dimensionConfig : `Config` or `str`, optional 

379 Configuration for dimensions, which will be used to initialize the 

380 registry database. 

381 standalone : `bool` 

382 If True, write all expanded defaults, not just customized or 

383 repository-specific settings. 

384 This (mostly) decouples the repository from the default 

385 configuration, insulating it from changes to the defaults (which 

386 may be good or bad, depending on the nature of the changes). 

387 Future *additions* to the defaults will still be picked up when 

388 initializing `Butlers` to repos created with ``standalone=True``. 

389 searchPaths : `list` of `str`, optional 

390 Directory paths to search when calculating the full butler 

391 configuration. 

392 forceConfigRoot : `bool`, optional 

393 If `False`, any values present in the supplied ``config`` that 

394 would normally be reset are not overridden and will appear 

395 directly in the output config. This allows non-standard overrides 

396 of the root directory for a datastore or registry to be given. 

397 If this parameter is `True` the values for ``root`` will be 

398 forced into the resulting config if appropriate. 

399 outfile : `lsst.resources.ResourcePathExpression`, optional 

400 If not-`None`, the output configuration will be written to this 

401 location rather than into the repository itself. Can be a URI 

402 string. Can refer to a directory that will be used to write 

403 ``butler.yaml``. 

404 overwrite : `bool`, optional 

405 Create a new configuration file even if one already exists 

406 in the specified output location. Default is to raise 

407 an exception. 

408 

409 Returns 

410 ------- 

411 config : `Config` 

412 The updated `Config` instance written to the repo. 

413 

414 Raises 

415 ------ 

416 ValueError 

417 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

418 regular Config (as these subclasses would make it impossible to 

419 support ``standalone=False``). 

420 FileExistsError 

421 Raised if the output config file already exists. 

422 os.error 

423 Raised if the directory does not exist, exists but is not a 

424 directory, or cannot be created. 

425 

426 Notes 

427 ----- 

428 Note that when ``standalone=False`` (the default), the configuration 

429 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

430 construct the repository should also be used to construct any Butlers 

431 to avoid configuration inconsistencies. 

432 """ 

433 if isinstance(config, (ButlerConfig, ConfigSubset)): 

434 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

435 

436 # Ensure that the root of the repository exists or can be made 

437 root_uri = ResourcePath(root, forceDirectory=True) 

438 root_uri.mkdir() 

439 

440 config = Config(config) 

441 

442 # If we are creating a new repo from scratch with relative roots, 

443 # do not propagate an explicit root from the config file 

444 if "root" in config: 

445 del config["root"] 

446 

447 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

448 imported_class = doImportType(full["datastore", "cls"]) 

449 if not issubclass(imported_class, Datastore): 

450 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

451 datastoreClass: Type[Datastore] = imported_class 

452 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

453 

454 # If the key exists in the given config, parse it; otherwise parse the defaults 

455 # in the expanded config 

456 if config.get(("registry", "db")): 

457 registryConfig = RegistryConfig(config) 

458 else: 

459 registryConfig = RegistryConfig(full) 

460 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

461 if defaultDatabaseUri is not None: 

462 Config.updateParameters( 

463 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

464 ) 

465 else: 

466 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

467 

468 if standalone: 

469 config.merge(full) 

470 else: 

471 # Always expand the registry.managers section into the per-repo 

472 # config, because after the database schema is created, it's not 

473 # allowed to change anymore. Note that in the standalone=True 

474 # branch, _everything_ in the config is expanded, so there's no 

475 # need to special case this. 

476 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

477 configURI: ResourcePathExpression 

478 if outfile is not None: 

479 # When writing to a separate location we must include 

480 # the root of the butler repo in the config else it won't know 

481 # where to look. 

482 config["root"] = root_uri.geturl() 

483 configURI = outfile 

484 else: 

485 configURI = root_uri 

486 # Strip obscore configuration, if it is present, before writing config 

488 # to a file; the obscore config will be stored in the registry. 

488 config_to_write = config 

489 if ("registry", "managers", "obscore") in config: 

490 config_to_write = config.copy() 

491 del config_to_write["registry", "managers", "obscore", "config"] 

492 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

493 

494 # Create Registry and populate tables 

495 registryConfig = RegistryConfig(config.get("registry")) 

496 dimensionConfig = DimensionConfig(dimensionConfig) 

497 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

498 

499 log.verbose("Wrote new Butler configuration file to %s", configURI) 

500 

501 return config 

502 
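# Illustrative sketch (hypothetical path and run name): creating an empty
# repository with default configuration and then opening a writeable Butler
# against it.
#
#     Butler.makeRepo("/tmp/example_repo")
#     butler = Butler("/tmp/example_repo", run="u/alice/ingest")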

503 @classmethod 

504 def _unpickle( 

505 cls, 

506 config: ButlerConfig, 

507 collections: Optional[CollectionSearch], 

508 run: Optional[str], 

509 defaultDataId: Dict[str, str], 

510 writeable: bool, 

511 ) -> Butler: 

512 """Callable used to unpickle a Butler. 

513 

514 We prefer not to use ``Butler.__init__`` directly so we can force some 

515 of its many arguments to be keyword-only (note that ``__reduce__`` 

516 can only invoke callables with positional arguments). 

517 

518 Parameters 

519 ---------- 

520 config : `ButlerConfig` 

521 Butler configuration, already coerced into a true `ButlerConfig` 

522 instance (and hence after any search paths for overrides have been 

523 utilized). 

524 collections : `CollectionSearch` 

525 Names of the default collections to read from. 

526 run : `str`, optional 

527 Name of the default `~CollectionType.RUN` collection to write to. 

528 defaultDataId : `dict` [ `str`, `str` ] 

529 Default data ID values. 

530 writeable : `bool` 

531 Whether the Butler should support write operations. 

532 

533 Returns 

534 ------- 

535 butler : `Butler` 

536 A new `Butler` instance. 

537 """ 

538 # MyPy doesn't recognize that the kwargs below are totally valid; it 

539 # seems to think ``**defaultDataId`` is a _positional_ argument! 

540 return cls( 

541 config=config, 

542 collections=collections, 

543 run=run, 

544 writeable=writeable, 

545 **defaultDataId, # type: ignore 

546 ) 

547 

548 def __reduce__(self) -> tuple: 

549 """Support pickling.""" 

550 return ( 

551 Butler._unpickle, 

552 ( 

553 self._config, 

554 self.collections, 

555 self.run, 

556 self.registry.defaults.dataId.byName(), 

557 self.registry.isWriteable(), 

558 ), 

559 ) 

560 

561 def __str__(self) -> str: 

562 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

563 self.collections, self.run, self.datastore, self.registry 

564 ) 

565 

566 def isWriteable(self) -> bool: 

567 """Return `True` if this `Butler` supports write operations.""" 

568 return self.registry.isWriteable() 

569 

570 @contextlib.contextmanager 

571 def transaction(self) -> Iterator[None]: 

572 """Context manager supporting `Butler` transactions. 

573 

574 Transactions can be nested. 

575 """ 

576 with self.registry.transaction(): 

577 with self.datastore.transaction(): 

578 yield 

579 
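# Illustrative sketch (hypothetical dataset type and data IDs): grouping
# several writes so that registry and datastore changes are rolled back
# together if any one of them fails.
#
#     with butler.transaction():
#         butler.put(catalog_a, "src", visit=101, detector=1)
#         butler.put(catalog_b, "src", visit=101, detector=2)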

580 def _standardizeArgs( 

581 self, 

582 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

583 dataId: Optional[DataId] = None, 

584 for_put: bool = True, 

585 **kwargs: Any, 

586 ) -> Tuple[DatasetType, Optional[DataId]]: 

587 """Standardize the arguments passed to several Butler APIs. 

588 

589 Parameters 

590 ---------- 

591 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

592 When `DatasetRef` the `dataId` should be `None`. 

593 Otherwise the `DatasetType` or name thereof. 

594 dataId : `dict` or `DataCoordinate` 

595 A `dict` of `Dimension` link name, value pairs that label the 

596 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

597 should be provided as the first argument. 

598 for_put : `bool`, optional 

599 If `True` this call is invoked as part of a `Butler.put()`. 

600 Otherwise it is assumed to be part of a `Butler.get()`. This 

601 parameter is only relevant if there is dataset type 

602 inconsistency. 

603 **kwargs 

604 Additional keyword arguments used to augment or construct a 

605 `DataCoordinate`. See `DataCoordinate.standardize` 

606 parameters. 

607 

608 Returns 

609 ------- 

610 datasetType : `DatasetType` 

611 A `DatasetType` instance extracted from ``datasetRefOrType``. 

612 dataId : `dict` or `DataId`, optional 

613 Argument that can be used (along with ``kwargs``) to construct a 

614 `DataId`. 

615 

616 Notes 

617 ----- 

618 Butler APIs that conceptually need a DatasetRef also allow passing a 

619 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

620 keyword arguments that can be used to construct one) separately. This 

621 method accepts those arguments and always returns a true `DatasetType` 

622 and a `DataId` or `dict`. 

623 

624 Standardization of `dict` vs `DataId` is best handled by passing the 

625 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

626 generally similarly flexible. 

627 """ 

628 externalDatasetType: Optional[DatasetType] = None 

629 internalDatasetType: Optional[DatasetType] = None 

630 if isinstance(datasetRefOrType, DatasetRef): 

631 if dataId is not None or kwargs: 

632 raise ValueError("DatasetRef given, cannot use dataId as well") 

633 externalDatasetType = datasetRefOrType.datasetType 

634 dataId = datasetRefOrType.dataId 

635 else: 

636 # Don't check whether DataId is provided, because Registry APIs 

637 # can usually construct a better error message when it wasn't. 

638 if isinstance(datasetRefOrType, DatasetType): 

639 externalDatasetType = datasetRefOrType 

640 else: 

641 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

642 

643 # Check that they are self-consistent 

644 if externalDatasetType is not None: 

645 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

646 if externalDatasetType != internalDatasetType: 

647 # We can allow differences if they are compatible, depending 

648 # on whether this is a get or a put. A get requires that 

649 # the python type associated with the datastore can be 

650 # converted to the user type. A put requires that the user 

651 # supplied python type can be converted to the internal 

652 # type expected by registry. 

653 relevantDatasetType = internalDatasetType 

654 if for_put: 

655 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

656 else: 

657 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

658 relevantDatasetType = externalDatasetType 

659 if not is_compatible: 

660 raise ValueError( 

661 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

662 f"registry definition ({internalDatasetType})" 

663 ) 

664 # Override the internal definition. 

665 internalDatasetType = relevantDatasetType 

666 

667 assert internalDatasetType is not None 

668 return internalDatasetType, dataId 

669 

670 def _rewrite_data_id( 

671 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

672 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

673 """Rewrite a data ID taking into account dimension records. 

674 

675 Take a Data ID and keyword args and rewrite it if necessary to 

676 allow the user to specify dimension records rather than dimension 

677 primary values. 

678 

679 This allows a user to include a dataId dict with keys of 

680 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

681 the integer exposure ID. It also allows a string to be given 

682 for a dimension value rather than the integer ID if that is more 

683 convenient. For example, rather than having to specify the 

684 detector with ``detector.full_name``, a string given for ``detector`` 

685 will be interpreted as the full name and converted to the integer 

686 value. 

687 

688 Keyword arguments can also use strings for dimensions like detector 

689 and exposure but python does not allow them to include ``.`` and 

690 so the ``exposure.day_obs`` syntax can not be used in a keyword 

691 argument. 

692 

693 Parameters 

694 ---------- 

695 dataId : `dict` or `DataCoordinate` 

696 A `dict` of `Dimension` link name, value pairs that will label the 

697 `DatasetRef` within a Collection. 

698 datasetType : `DatasetType` 

699 The dataset type associated with this dataId. Required to 

700 determine the relevant dimensions. 

701 **kwargs 

702 Additional keyword arguments used to augment or construct a 

703 `DataId`. See `DataId` parameters. 

704 

705 Returns 

706 ------- 

707 dataId : `dict` or `DataCoordinate` 

708 The possibly rewritten dataId. If given a `DataCoordinate` and 

709 no keyword arguments, the original dataId will be returned 

710 unchanged. 

711 **kwargs : `dict` 

712 Any unused keyword arguments (would normally be empty dict). 

713 """ 

714 # Do nothing if we have a standalone DataCoordinate. 

715 if isinstance(dataId, DataCoordinate) and not kwargs: 

716 return dataId, kwargs 

717 

718 # Process dimension records that are using record information 

719 # rather than ids 

720 newDataId: Dict[str, DataIdValue] = {} 

721 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

722 

723 # If all of the dataId comes from keyword parameters we do not need 

724 # to do anything here because they can't be of the form 

725 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

726 if dataId: 

727 for k, v in dataId.items(): 

728 # If we have a Dimension we do not need to do anything 

729 # because it cannot be a compound key. 

730 if isinstance(k, str) and "." in k: 

731 # Someone is using a more human-readable dataId 

732 dimensionName, record = k.split(".", 1) 

733 byRecord[dimensionName][record] = v 

734 elif isinstance(k, Dimension): 

735 newDataId[k.name] = v 

736 else: 

737 newDataId[k] = v 

738 

739 # Go through the updated dataId and check the type in case someone is 

740 # using an alternate key. We have already filtered out the compound 

741 # keys in the dimension.record format. 

742 not_dimensions = {} 

743 

744 # Will need to look in the dataId and the keyword arguments 

745 # and will remove them if they need to be fixed or are unrecognized. 

746 for dataIdDict in (newDataId, kwargs): 

747 # Use a list so we can adjust the dict safely in the loop 

748 for dimensionName in list(dataIdDict): 

749 value = dataIdDict[dimensionName] 

750 try: 

751 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

752 except KeyError: 

753 # This is not a real dimension 

754 not_dimensions[dimensionName] = value 

755 del dataIdDict[dimensionName] 

756 continue 

757 

758 # Convert an integral type to an explicit int to simplify 

759 # comparisons here 

760 if isinstance(value, numbers.Integral): 

761 value = int(value) 

762 

763 if not isinstance(value, dimension.primaryKey.getPythonType()): 

764 for alternate in dimension.alternateKeys: 

765 if isinstance(value, alternate.getPythonType()): 

766 byRecord[dimensionName][alternate.name] = value 

767 del dataIdDict[dimensionName] 

768 log.debug( 

769 "Converting dimension %s to %s.%s=%s", 

770 dimensionName, 

771 dimensionName, 

772 alternate.name, 

773 value, 

774 ) 

775 break 

776 else: 

777 log.warning( 

778 "Type mismatch found for value '%r' provided for dimension %s. " 

779 "Could not find matching alternative (primary key has type %s) " 

780 "so attempting to use as-is.", 

781 value, 

782 dimensionName, 

783 dimension.primaryKey.getPythonType(), 

784 ) 

785 

786 # By this point kwargs and newDataId should only include valid 

787 # dimensions. Merge kwargs in to the new dataId and log if there 

788 # are dimensions in both (rather than calling update). 

789 for k, v in kwargs.items(): 

790 if k in newDataId and newDataId[k] != v: 

791 log.debug( 

792 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

793 ) 

794 newDataId[k] = v 

795 # No need to retain any values in kwargs now. 

796 kwargs = {} 

797 

798 # If we have some unrecognized dimensions we have to try to connect 

799 # them to records in other dimensions. This is made more complicated 

800 # by some dimensions having records with clashing names. A mitigation 

801 # is that we can tell by this point which dimensions are missing 

802 # for the DatasetType but this does not work for calibrations 

803 # where additional dimensions can be used to constrain the temporal 

804 # axis. 

805 if not_dimensions: 

806 # Search for all dimensions even if we have been given a value 

807 # explicitly. In some cases records are given as well as the 

808 # actual dimension and this should not be an error if they 

809 # match. 

810 mandatoryDimensions = datasetType.dimensions.names # - provided 

811 

812 candidateDimensions: Set[str] = set() 

813 candidateDimensions.update(mandatoryDimensions) 

814 

815 # For calibrations we may well be needing temporal dimensions 

816 # so rather than always including all dimensions in the scan 

817 # restrict things a little. It is still possible for there 

818 # to be confusion over day_obs in visit vs exposure for example. 

819 # If we are not searching calibration collections things may 

820 # fail but they are going to fail anyway because of the 

821 # ambiguity of the dataId... 

822 if datasetType.isCalibration(): 

823 for dim in self.registry.dimensions.getStaticDimensions(): 

824 if dim.temporal: 

825 candidateDimensions.add(str(dim)) 

826 

827 # Look up table for the first association with a dimension 

828 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

829 

830 # Keep track of whether an item is associated with multiple 

831 # dimensions. 

832 counter: Counter[str] = Counter() 

833 assigned: Dict[str, Set[str]] = defaultdict(set) 

834 

835 # Go through the missing dimensions and associate the 

836 # given names with records within those dimensions 

837 matched_dims = set() 

838 for dimensionName in candidateDimensions: 

839 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

840 fields = dimension.metadata.names | dimension.uniqueKeys.names 

841 for field in not_dimensions: 

842 if field in fields: 

843 guessedAssociation[dimensionName][field] = not_dimensions[field] 

844 counter[dimensionName] += 1 

845 assigned[field].add(dimensionName) 

846 matched_dims.add(field) 

847 

848 # Calculate the fields that matched nothing. 

849 never_found = set(not_dimensions) - matched_dims 

850 

851 if never_found: 

852 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

853 

854 # There is a chance we have allocated a single dataId item 

855 # to multiple dimensions. Need to decide which should be retained. 

856 # For now assume that the most popular alternative wins. 

857 # This means that day_obs with seq_num will result in 

858 # exposure.day_obs and not visit.day_obs 

859 # Also prefer an explicitly missing dimension over an inferred 

860 # temporal dimension. 

861 for fieldName, assignedDimensions in assigned.items(): 

862 if len(assignedDimensions) > 1: 

863 # Pick the most popular (preferring mandatory dimensions) 

864 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

865 if requiredButMissing: 

866 candidateDimensions = requiredButMissing 

867 else: 

868 candidateDimensions = assignedDimensions 

869 

870 # If this is a choice between visit and exposure and 

871 # neither was a required part of the dataset type, 

872 # (hence in this branch) always prefer exposure over 

873 # visit since exposures are always defined and visits 

874 # are defined from exposures. 

875 if candidateDimensions == {"exposure", "visit"}: 

876 candidateDimensions = {"exposure"} 

877 

878 # Select the relevant items and get a new restricted 

879 # counter. 

880 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

881 duplicatesCounter: Counter[str] = Counter() 

882 duplicatesCounter.update(theseCounts) 

883 

884 # Choose the most common. If they are equally common 

885 # we will pick the one that was found first. 

886 # (most_common returns a list of (item, count) tuples). 

887 selected = duplicatesCounter.most_common(1)[0][0] 

888 

889 log.debug( 

890 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

891 " Removed ambiguity by choosing dimension %s.", 

892 fieldName, 

893 ", ".join(assignedDimensions), 

894 selected, 

895 ) 

896 

897 for candidateDimension in assignedDimensions: 

898 if candidateDimension != selected: 

899 del guessedAssociation[candidateDimension][fieldName] 

900 

901 # Update the record look up dict with the new associations 

902 for dimensionName, values in guessedAssociation.items(): 

903 if values: # A dict might now be empty 

904 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

905 byRecord[dimensionName].update(values) 

906 

907 if byRecord: 

908 # Some record specifiers were found so we need to convert 

909 # them to the Id form 

910 for dimensionName, values in byRecord.items(): 

911 if dimensionName in newDataId: 

912 log.debug( 

913 "DataId specified explicit %s dimension value of %s in addition to" 

914 " general record specifiers for it of %s. Ignoring record information.", 

915 dimensionName, 

916 newDataId[dimensionName], 

917 str(values), 

918 ) 

919 # Get the actual record and compare with these values. 

920 try: 

921 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

922 except DataIdError: 

923 raise ValueError( 

924 f"Could not find dimension '{dimensionName}'" 

925 f" with dataId {newDataId} as part of comparing with" 

926 f" record values {byRecord[dimensionName]}" 

927 ) from None 

928 if len(recs) == 1: 

929 errmsg: List[str] = [] 

930 for k, v in values.items(): 

931 if (recval := getattr(recs[0], k)) != v: 

932 errmsg.append(f"{k}({recval} != {v})") 

933 if errmsg: 

934 raise ValueError( 

935 f"Dimension {dimensionName} in dataId has explicit value" 

936 " inconsistent with records: " + ", ".join(errmsg) 

937 ) 

938 else: 

939 # Multiple matches for an explicit dimension 

940 # should never happen but let downstream complain. 

941 pass 

942 continue 

943 

944 # Build up a WHERE expression 

945 bind = {k: v for k, v in values.items()} 

946 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

947 

948 # Hopefully we get a single record that matches 

949 records = set( 

950 self.registry.queryDimensionRecords( 

951 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

952 ) 

953 ) 

954 

955 if len(records) != 1: 

956 if len(records) > 1: 

957 # visit can have an ambiguous answer without involving 

958 # visit_system. The default visit_system is defined 

959 # by the instrument. 

960 if ( 

961 dimensionName == "visit" 

962 and "visit_system_membership" in self.registry.dimensions 

963 and "visit_system" 

964 in self.registry.dimensions["instrument"].metadata # type: ignore 

965 ): 

966 instrument_records = list( 

967 self.registry.queryDimensionRecords( 

968 "instrument", 

969 dataId=newDataId, 

970 **kwargs, 

971 ) 

972 ) 

973 if len(instrument_records) == 1: 

974 visit_system = instrument_records[0].visit_system 

975 if visit_system is None: 

976 # Set to a value that will never match. 

977 visit_system = -1 

978 

979 # Look up each visit in the 

980 # visit_system_membership records. 

981 for rec in records: 

982 membership = list( 

983 self.registry.queryDimensionRecords( 

984 # Use bind to allow zero results. 

985 # This is a fully-specified query. 

986 "visit_system_membership", 

987 where="instrument = inst AND visit_system = system AND visit = v", 

988 bind=dict( 

989 inst=instrument_records[0].name, system=visit_system, v=rec.id 

990 ), 

991 ) 

992 ) 

993 if membership: 

994 # This record is the right answer. 

995 records = set([rec]) 

996 break 

997 

998 # The ambiguity may have been resolved so check again. 

999 if len(records) > 1: 

1000 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

1001 for r in records: 

1002 log.debug("- %s", str(r)) 

1003 raise ValueError( 

1004 f"DataId specification for dimension {dimensionName} is not" 

1005 f" uniquely constrained to a single dataset by {values}." 

1006 f" Got {len(records)} results." 

1007 ) 

1008 else: 

1009 raise ValueError( 

1010 f"DataId specification for dimension {dimensionName} matched no" 

1011 f" records when constrained by {values}" 

1012 ) 

1013 

1014 # Get the primary key from the real dimension object 

1015 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1016 if not isinstance(dimension, Dimension): 

1017 raise RuntimeError( 

1018 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1019 ) 

1020 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1021 

1022 return newDataId, kwargs 

1023 
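# Illustrative sketch (hypothetical instrument, detector and exposure values):
# the rewriting above is what lets callers label a dataset either with
# dimension primary keys or with record values such as a detector full name
# or exposure.day_obs/seq_num.
#
#     butler.get("raw", instrument="HSC", detector=10, exposure=903334)
#     butler.get("raw", instrument="HSC", detector="1_53",
#                dataId={"exposure.day_obs": 20130617, "exposure.seq_num": 34})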

1024 def _findDatasetRef( 

1025 self, 

1026 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1027 dataId: Optional[DataId] = None, 

1028 *, 

1029 collections: Any = None, 

1030 allowUnresolved: bool = False, 

1031 **kwargs: Any, 

1032 ) -> DatasetRef: 

1033 """Shared logic for methods that start with a search for a dataset in 

1034 the registry. 

1035 

1036 Parameters 

1037 ---------- 

1038 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1039 When `DatasetRef` the `dataId` should be `None`. 

1040 Otherwise the `DatasetType` or name thereof. 

1041 dataId : `dict` or `DataCoordinate`, optional 

1042 A `dict` of `Dimension` link name, value pairs that label the 

1043 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1044 should be provided as the first argument. 

1045 collections : Any, optional 

1046 Collections to be searched, overriding ``self.collections``. 

1047 Can be any of the types supported by the ``collections`` argument 

1048 to butler construction. 

1049 allowUnresolved : `bool`, optional 

1050 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1051 one in the `Registry` fails. Defaults to `False`. 

1052 **kwargs 

1053 Additional keyword arguments used to augment or construct a 

1054 `DataId`. See `DataId` parameters. 

1055 

1056 Returns 

1057 ------- 

1058 ref : `DatasetRef` 

1059 A reference to the dataset identified by the given arguments. 

1060 

1061 Raises 

1062 ------ 

1063 LookupError 

1064 Raised if no matching dataset exists in the `Registry` (and 

1065 ``allowUnresolved is False``). 

1066 ValueError 

1067 Raised if a resolved `DatasetRef` was passed as an input, but it 

1068 differs from the one found in the registry. 

1069 TypeError 

1070 Raised if no collections were provided. 

1071 """ 

1072 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1073 if isinstance(datasetRefOrType, DatasetRef): 

1074 idNumber = datasetRefOrType.id 

1075 else: 

1076 idNumber = None 

1077 timespan: Optional[Timespan] = None 

1078 

1079 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1080 

1081 if datasetType.isCalibration(): 

1082 # Because this is a calibration dataset, first try to 

1083 # standardize the data ID without restricting the dimensions to 

1084 # those of the dataset type requested, because there may be extra 

1085 # dimensions that provide temporal information for a validity-range 

1086 # lookup. 

1087 dataId = DataCoordinate.standardize( 

1088 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1089 ) 

1090 if dataId.graph.temporal: 

1091 dataId = self.registry.expandDataId(dataId) 

1092 timespan = dataId.timespan 

1093 else: 

1094 # Standardize the data ID to just the dimensions of the dataset 

1095 # type instead of letting registry.findDataset do it, so we get the 

1096 # result even if no dataset is found. 

1097 dataId = DataCoordinate.standardize( 

1098 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1099 ) 

1100 # Always look up the DatasetRef, even if one is given, to ensure it is 

1101 # present in the current collection. 

1102 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1103 if ref is None: 

1104 if allowUnresolved: 

1105 return DatasetRef(datasetType, dataId) 

1106 else: 

1107 if collections is None: 

1108 collections = self.registry.defaults.collections 

1109 raise LookupError( 

1110 f"Dataset {datasetType.name} with data ID {dataId} " 

1111 f"could not be found in collections {collections}." 

1112 ) 

1113 if idNumber is not None and idNumber != ref.id: 

1114 if collections is None: 

1115 collections = self.registry.defaults.collections 

1116 raise ValueError( 

1117 f"DatasetRef.id provided ({idNumber}) does not match " 

1118 f"id ({ref.id}) in registry in collections {collections}." 

1119 ) 

1120 if datasetType != ref.datasetType: 

1121 # If they differ it is because the user explicitly specified 

1122 # a compatible dataset type to this call rather than using the 

1123 # registry definition. The DatasetRef must therefore be recreated 

1124 # using the user definition such that the expected type is 

1125 # returned. 

1126 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1127 

1128 return ref 

1129 

1130 @transactional 

1131 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

1132 # Docstring inherited. 

1133 (imported_ref,) = self.registry._importDatasets( 

1134 [ref], 

1135 expand=True, 

1136 ) 

1137 if imported_ref.id != ref.getCheckedId(): 

1138 raise RuntimeError("This registry configuration does not support putDirect.") 

1139 self.datastore.put(obj, ref) 

1140 return ref 

1141 

1142 @transactional 

1143 def put( 

1144 self, 

1145 obj: Any, 

1146 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1147 dataId: Optional[DataId] = None, 

1148 *, 

1149 run: Optional[str] = None, 

1150 **kwargs: Any, 

1151 ) -> DatasetRef: 

1152 """Store and register a dataset. 

1153 

1154 Parameters 

1155 ---------- 

1156 obj : `object` 

1157 The dataset. 

1158 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1159 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1160 Otherwise the `DatasetType` or name thereof. 

1161 dataId : `dict` or `DataCoordinate` 

1162 A `dict` of `Dimension` link name, value pairs that label the 

1163 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1164 should be provided as the second argument. 

1165 run : `str`, optional 

1166 The name of the run the dataset should be added to, overriding 

1167 ``self.run``. 

1168 **kwargs 

1169 Additional keyword arguments used to augment or construct a 

1170 `DataCoordinate`. See `DataCoordinate.standardize` 

1171 parameters. 

1172 

1173 Returns 

1174 ------- 

1175 ref : `DatasetRef` 

1176 A reference to the stored dataset, updated with the correct id if 

1177 given. 

1178 

1179 Raises 

1180 ------ 

1181 TypeError 

1182 Raised if the butler is read-only or if no run has been provided. 

1183 """ 

1184 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1185 if not self.isWriteable(): 

1186 raise TypeError("Butler is read-only.") 

1187 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1188 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1189 raise ValueError("DatasetRef must not be in registry, must have None id") 

1190 

1191 # Handle dimension records in dataId 

1192 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1193 

1194 # Add Registry Dataset entry. 

1195 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1196 

1197 # For an execution butler the datasets will be pre-defined. 

1198 # If the butler is configured that way datasets should only be inserted 

1199 # if they do not already exist in registry. Trying and catching 

1200 # ConflictingDefinitionError will not work because the transaction 

1201 # will be corrupted. Instead, in this mode always check first. 

1202 ref = None 

1203 ref_is_predefined = False 

1204 if self._allow_put_of_predefined_dataset: 

1205 # Get the matching ref for this run. 

1206 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1207 

1208 if ref: 

1209 # Must be expanded form for datastore templating 

1210 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1211 ref = ref.expanded(dataId) 

1212 ref_is_predefined = True 

1213 

1214 if not ref: 

1215 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1216 

1217 # If the ref is predefined it is possible that the datastore also 

1218 # has the record. Asking datastore to put it again will result in 

1219 # the artifact being recreated, overwriting the previous one; the 

1220 # subsequent failure to write the record will then cause the artifact 

1221 # to be removed. Much safer to ask first before attempting to 

1222 # overwrite. Race conditions should not be an issue for the 

1223 # execution butler environment. 

1224 if ref_is_predefined: 

1225 if self.datastore.knows(ref): 

1226 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

1227 

1228 self.datastore.put(obj, ref) 

1229 

1230 return ref 

1231 
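# Illustrative sketch (hypothetical dataset type, data ID and run): a put may
# name the dataset type and data ID explicitly, or pass an unresolved
# DatasetRef, optionally overriding the default run.
#
#     butler.put(exposure, "calexp", instrument="HSC", visit=101, detector=42)
#     butler.put(exposure, unresolved_ref, run="u/alice/DM-50000/a")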

1232 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1233 """Retrieve a stored dataset. 

1234 

1235 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1236 collection to be read as long as the `DatasetRef` that identifies them 

1237 can be obtained separately. 

1238 

1239 Parameters 

1240 ---------- 

1241 ref : `DatasetRef` 

1242 Resolved reference to an already stored dataset. 

1243 parameters : `dict` 

1244 Additional StorageClass-defined options to control reading, 

1245 typically used to efficiently read only a subset of the dataset. 

1246 

1247 Returns 

1248 ------- 

1249 obj : `object` 

1250 The dataset. 

1251 """ 

1252 return self.datastore.get(ref, parameters=parameters) 

1253 

1254 def getDirectDeferred( 

1255 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

1256 ) -> DeferredDatasetHandle: 

1257 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1258 from a resolved `DatasetRef`. 

1259 

1260 Parameters 

1261 ---------- 

1262 ref : `DatasetRef` 

1263 Resolved reference to an already stored dataset. 

1264 parameters : `dict` 

1265 Additional StorageClass-defined options to control reading, 

1266 typically used to efficiently read only a subset of the dataset. 

1267 

1268 Returns 

1269 ------- 

1270 obj : `DeferredDatasetHandle` 

1271 A handle which can be used to retrieve a dataset at a later time. 

1272 

1273 Raises 

1274 ------ 

1275 AmbiguousDatasetError 

1276 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1277 """ 

1278 if ref.id is None: 

1279 raise AmbiguousDatasetError( 

1280 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1281 ) 

1282 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1283 

1284 def getDeferred( 

1285 self, 

1286 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1287 dataId: Optional[DataId] = None, 

1288 *, 

1289 parameters: Union[dict, None] = None, 

1290 collections: Any = None, 

1291 **kwargs: Any, 

1292 ) -> DeferredDatasetHandle: 

1293 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1294 after an immediate registry lookup. 

1295 

1296 Parameters 

1297 ---------- 

1298 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1299 When `DatasetRef` the `dataId` should be `None`. 

1300 Otherwise the `DatasetType` or name thereof. 

1301 dataId : `dict` or `DataCoordinate`, optional 

1302 A `dict` of `Dimension` link name, value pairs that label the 

1303 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1304 should be provided as the first argument. 

1305 parameters : `dict` 

1306 Additional StorageClass-defined options to control reading, 

1307 typically used to efficiently read only a subset of the dataset. 

1308 collections : Any, optional 

1309 Collections to be searched, overriding ``self.collections``. 

1310 Can be any of the types supported by the ``collections`` argument 

1311 to butler construction. 

1312 **kwargs 

1313 Additional keyword arguments used to augment or construct a 

1314 `DataId`. See `DataId` parameters. 

1315 

1316 Returns 

1317 ------- 

1318 obj : `DeferredDatasetHandle` 

1319 A handle which can be used to retrieve a dataset at a later time. 

1320 

1321 Raises 

1322 ------ 

1323 LookupError 

1324 Raised if no matching dataset exists in the `Registry`. 

1326 ValueError 

1327 Raised if a resolved `DatasetRef` was passed as an input, but it 

1328 differs from the one found in the registry. 

1329 TypeError 

1330 Raised if no collections were provided. 

1331 """ 

1332 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1333 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1334 

1335 def get( 

1336 self, 

1337 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1338 dataId: Optional[DataId] = None, 

1339 *, 

1340 parameters: Optional[Dict[str, Any]] = None, 

1341 collections: Any = None, 

1342 **kwargs: Any, 

1343 ) -> Any: 

1344 """Retrieve a stored dataset. 

1345 

1346 Parameters 

1347 ---------- 

1348 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1349 When `DatasetRef` the `dataId` should be `None`. 

1350 Otherwise the `DatasetType` or name thereof. 

1351 dataId : `dict` or `DataCoordinate` 

1352 A `dict` of `Dimension` link name, value pairs that label the 

1353 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1354 should be provided as the first argument. 

1355 parameters : `dict` 

1356 Additional StorageClass-defined options to control reading, 

1357 typically used to efficiently read only a subset of the dataset. 

1358 collections : Any, optional 

1359 Collections to be searched, overriding ``self.collections``. 

1360 Can be any of the types supported by the ``collections`` argument 

1361 to butler construction. 

1362 **kwargs 

1363 Additional keyword arguments used to augment or construct a 

1364 `DataCoordinate`. See `DataCoordinate.standardize` 

1365 parameters. 

1366 

1367 Returns 

1368 ------- 

1369 obj : `object` 

1370 The dataset. 

1371 

1372 Raises 

1373 ------ 

1374 ValueError 

1375 Raised if a resolved `DatasetRef` was passed as an input, but it 

1376 differs from the one found in the registry. 

1377 LookupError 

1378 Raised if no matching dataset exists in the `Registry`. 

1379 TypeError 

1380 Raised if no collections were provided. 

1381 

1382 Notes 

1383 ----- 

1384 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1385 this method requires that the given data ID include temporal dimensions 

1386 beyond the dimensions of the dataset type itself, in order to find the 

1387 dataset with the appropriate validity range. For example, a "bias" 

1388 dataset with native dimensions ``{instrument, detector}`` could be 

1389 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1390 ``exposure`` is a temporal dimension. 

1391 """ 

1392 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1393 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1394 return self.getDirect(ref, parameters=parameters) 

1395 
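# Illustrative sketch (hypothetical dataset types and data IDs): a plain get
# by dataset type name, and a calibration lookup where the extra temporal
# dimension (exposure) selects the matching validity range.
#
#     calexp = butler.get("calexp", instrument="HSC", visit=101, detector=42)
#     bias = butler.get("bias", instrument="HSC", detector=42, exposure=903334,
#                       collections="HSC/calib")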

1396 def getURIs( 

1397 self, 

1398 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1399 dataId: Optional[DataId] = None, 

1400 *, 

1401 predict: bool = False, 

1402 collections: Any = None, 

1403 run: Optional[str] = None, 

1404 **kwargs: Any, 

1405 ) -> DatasetRefURIs: 

1406 """Returns the URIs associated with the dataset. 

1407 

1408 Parameters 

1409 ---------- 

1410 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1411 When `DatasetRef` the `dataId` should be `None`. 

1412 Otherwise the `DatasetType` or name thereof. 

1413 dataId : `dict` or `DataCoordinate` 

1414 A `dict` of `Dimension` link name, value pairs that label the 

1415 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1416 should be provided as the first argument. 

1417 predict : `bool` 

1418 If `True`, allow URIs to be returned for datasets that have not 

1419 been written. 

1420 collections : Any, optional 

1421 Collections to be searched, overriding ``self.collections``. 

1422 Can be any of the types supported by the ``collections`` argument 

1423 to butler construction. 

1424 run : `str`, optional 

1425 Run to use for predictions, overriding ``self.run``. 

1426 **kwargs 

1427 Additional keyword arguments used to augment or construct a 

1428 `DataCoordinate`. See `DataCoordinate.standardize` 

1429 parameters. 

1430 

1431 Returns 

1432 ------- 

1433 uris : `DatasetRefURIs` 

1434 The URI to the primary artifact associated with this dataset (if 

1435 the dataset was disassembled within the datastore this may be 

1436 `None`), and the URIs to any components associated with the dataset 

1437 artifact (this mapping can be empty if there are no components). 
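
Examples
--------
An illustrative call, assuming an existing butler; the dataset type
name and data ID values are hypothetical::

primary, components = butler.getURIs("calexp", instrument="HSC",
visit=12345, detector=42)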

1438 """ 

1439 ref = self._findDatasetRef( 

1440 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1441 ) 

1442 if ref.id is None: # only possible if predict is True 

1443 if run is None: 

1444 run = self.run 

1445 if run is None: 

1446 raise TypeError("Cannot predict location with run=None.") 

1447 # Lie about ID, because we can't guess it, and only 

1448 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1449 ref = ref.resolved(id=0, run=run) 

1450 return self.datastore.getURIs(ref, predict) 

1451 

1452 def getURI( 

1453 self, 

1454 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1455 dataId: Optional[DataId] = None, 

1456 *, 

1457 predict: bool = False, 

1458 collections: Any = None, 

1459 run: Optional[str] = None, 

1460 **kwargs: Any, 

1461 ) -> ResourcePath: 

1462 """Return the URI to the Dataset. 

1463 

1464 Parameters 

1465 ---------- 

1466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1467 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1468 Otherwise the `DatasetType` or name thereof. 

1469 dataId : `dict` or `DataCoordinate` 

1470 A `dict` of `Dimension` link name, value pairs that label the 

1471 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1472 should be provided as the first argument. 

1473 predict : `bool` 

1474 If `True`, allow URIs to be returned of datasets that have not 

1475 been written. 

1476 collections : Any, optional 

1477 Collections to be searched, overriding ``self.collections``. 

1478 Can be any of the types supported by the ``collections`` argument 

1479 to butler construction. 

1480 run : `str`, optional 

1481 Run to use for predictions, overriding ``self.run``. 

1482 **kwargs 

1483 Additional keyword arguments used to augment or construct a 

1484 `DataCoordinate`. See `DataCoordinate.standardize` 

1485 parameters. 

1486 

1487 Returns 

1488 ------- 

1489 uri : `lsst.resources.ResourcePath` 

1490 URI pointing to the Dataset within the datastore. If the 

1491 Dataset does not exist in the datastore, and if ``predict`` is 

1492 `True`, the URI will be a prediction and will include a URI 

1493 fragment "#predicted". 

1494 If the datastore does not have entities that relate well 

1495 to the concept of a URI, the returned URI string will be 

1496 descriptive. The returned URI is not guaranteed to be obtainable. 

1497 

1498 Raises 

1499 ------ 

1500 LookupError 

1501 Raised if a URI has been requested for a dataset that does not exist and 

1502 guessing is not allowed. 

1503 ValueError 

1504 Raised if a resolved `DatasetRef` was passed as an input, but it 

1505 differs from the one found in the registry. 

1506 TypeError 

1507 Raised if no collections were provided. 

1508 RuntimeError 

1509 Raised if a URI is requested for a dataset that consists of 

1510 multiple artifacts. 
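
Examples
--------
An illustrative call; the dataset type, data ID, and run name are
hypothetical. Passing ``predict=True`` allows a URI to be returned
even if the dataset has not been written yet::

uri = butler.getURI("calexp", instrument="HSC", visit=12345,
detector=42, predict=True, run="HSC/runs/demo")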

1511 """ 

1512 primary, components = self.getURIs( 

1513 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1514 ) 

1515 

1516 if primary is None or components: 

1517 raise RuntimeError( 

1518 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1519 "Use Butler.getURIs() instead." 

1520 ) 

1521 return primary 

1522 

1523 def retrieveArtifacts( 

1524 self, 

1525 refs: Iterable[DatasetRef], 

1526 destination: ResourcePathExpression, 

1527 transfer: str = "auto", 

1528 preserve_path: bool = True, 

1529 overwrite: bool = False, 

1530 ) -> List[ResourcePath]: 

1531 """Retrieve the artifacts associated with the supplied refs. 

1532 

1533 Parameters 

1534 ---------- 

1535 refs : iterable of `DatasetRef` 

1536 The datasets for which artifacts are to be retrieved. 

1537 A single ref can result in multiple artifacts. The refs must 

1538 be resolved. 

1539 destination : `lsst.resources.ResourcePath` or `str` 

1540 Location to write the artifacts. 

1541 transfer : `str`, optional 

1542 Method to use to transfer the artifacts. Must be one of the options 

1543 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1544 "move" is not allowed. 

1545 preserve_path : `bool`, optional 

1546 If `True` the full path of the artifact within the datastore 

1547 is preserved. If `False` the final file component of the path 

1548 is used. 

1549 overwrite : `bool`, optional 

1550 If `True` allow transfers to overwrite existing files at the 

1551 destination. 

1552 

1553 Returns 

1554 ------- 

1555 targets : `list` of `lsst.resources.ResourcePath` 

1556 URIs of file artifacts in destination location. Order is not 

1557 preserved. 

1558 

1559 Notes 

1560 ----- 

1561 For non-file datastores the artifacts written to the destination 

1562 may not match the representation inside the datastore. For example 

1563 a hierarchical data structure in a NoSQL database may well be stored 

1564 as a JSON file. 
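
Examples
--------
An illustrative sketch that copies the file artifacts of some
hypothetical "calexp" datasets to a local directory::

refs = butler.registry.queryDatasets("calexp",
collections="HSC/runs/demo")
paths = butler.retrieveArtifacts(refs, destination="/tmp/artifacts",
transfer="copy")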

1565 """ 

1566 return self.datastore.retrieveArtifacts( 

1567 refs, 

1568 ResourcePath(destination), 

1569 transfer=transfer, 

1570 preserve_path=preserve_path, 

1571 overwrite=overwrite, 

1572 ) 

1573 

1574 def datasetExists( 

1575 self, 

1576 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1577 dataId: Optional[DataId] = None, 

1578 *, 

1579 collections: Any = None, 

1580 **kwargs: Any, 

1581 ) -> bool: 

1582 """Return True if the Dataset is actually present in the Datastore. 

1583 

1584 Parameters 

1585 ---------- 

1586 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1587 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1588 Otherwise the `DatasetType` or name thereof. 

1589 dataId : `dict` or `DataCoordinate` 

1590 A `dict` of `Dimension` link name, value pairs that label the 

1591 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1592 should be provided as the first argument. 

1593 collections : Any, optional 

1594 Collections to be searched, overriding ``self.collections``. 

1595 Can be any of the types supported by the ``collections`` argument 

1596 to butler construction. 

1597 **kwargs 

1598 Additional keyword arguments used to augment or construct a 

1599 `DataCoordinate`. See `DataCoordinate.standardize` 

1600 parameters. 

1601 

1602 Raises 

1603 ------ 

1604 LookupError 

1605 Raised if the dataset is not even present in the Registry. 

1606 ValueError 

1607 Raised if a resolved `DatasetRef` was passed as an input, but it 

1608 differs from the one found in the registry. 

1609 TypeError 

1610 Raised if no collections were provided. 
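
Examples
--------
An illustrative check for a hypothetical "raw" dataset::

exists = butler.datasetExists("raw", instrument="HSC", exposure=12345,
detector=42, collections="HSC/raw/all")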

1611 """ 

1612 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1613 return self.datastore.exists(ref) 

1614 

1615 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1616 """Remove one or more `~CollectionType.RUN` collections and the 

1617 datasets within them. 

1618 

1619 Parameters 

1620 ---------- 

1621 names : `Iterable` [ `str` ] 

1622 The names of the collections to remove. 

1623 unstore : `bool`, optional 

1624 If `True` (default), delete datasets from all datastores in which 

1625 they are present, and attempt to roll back the registry deletions if 

1626 datastore deletions fail (which may not always be possible). If 

1627 `False`, datastore records for these datasets are still removed, 

1628 but any artifacts (e.g. files) will not be. 

1629 

1630 Raises 

1631 ------ 

1632 TypeError 

1633 Raised if one or more collections are not of type 

1634 `~CollectionType.RUN`. 
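
Examples
--------
An illustrative removal of a single hypothetical RUN collection,
deleting its stored artifacts as well (the butler must be writeable)::

butler.removeRuns(["u/someone/scratch-run"], unstore=True)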

1635 """ 

1636 if not self.isWriteable(): 

1637 raise TypeError("Butler is read-only.") 

1638 names = list(names) 

1639 refs: List[DatasetRef] = [] 

1640 for name in names: 

1641 collectionType = self.registry.getCollectionType(name) 

1642 if collectionType is not CollectionType.RUN: 

1643 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1644 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1645 with self.datastore.transaction(): 

1646 with self.registry.transaction(): 

1647 if unstore: 

1648 self.datastore.trash(refs) 

1649 else: 

1650 self.datastore.forget(refs) 

1651 for name in names: 

1652 self.registry.removeCollection(name) 

1653 if unstore: 

1654 # Point of no return for removing artifacts 

1655 self.datastore.emptyTrash() 

1656 

1657 def pruneCollection( 

1658 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1659 ) -> None: 

1660 """Remove a collection and possibly prune datasets within it. 

1661 

1662 Parameters 

1663 ---------- 

1664 name : `str` 

1665 Name of the collection to remove. If this is a 

1666 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1667 datasets within the collection are not modified unless ``unstore`` 

1668 is `True`. If this is a `~CollectionType.RUN` collection, 

1669 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1670 are fully removed from the data repository. 

1671 purge : `bool`, optional 

1672 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1673 fully removing datasets within them. Requires ``unstore=True`` as 

1674 well as an added precaution against accidental deletion. Must be 

1675 `False` (default) if the collection is not a ``RUN``. 

1676 unstore : `bool`, optional 

1677 If `True`, remove all datasets in the collection from all 

1678 datastores in which they appear. 

1679 unlink : `list` [`str`], optional 

1680 Before removing the given collection, unlink it from these 

1681 parent collections. 

1682 

1683 Raises 

1684 ------ 

1685 TypeError 

1686 Raised if the butler is read-only or arguments are mutually 

1687 inconsistent. 
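
Examples
--------
Illustrative calls; the collection names are hypothetical::

# Fully remove a RUN collection and the datasets within it.
butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)
# Remove a TAGGED collection without touching its datasets.
butler.pruneCollection("my-tagged-selection")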

1688 """ 

1689 # See pruneDatasets comments for more information about the logic here; 

1690 # the cases are almost the same, but here we can rely on Registry to 

1691 # take care of everything but Datastore deletion when we remove the 

1692 # collection. 

1693 if not self.isWriteable(): 

1694 raise TypeError("Butler is read-only.") 

1695 collectionType = self.registry.getCollectionType(name) 

1696 if purge and not unstore: 

1697 raise PurgeWithoutUnstorePruneCollectionsError() 

1698 if collectionType is CollectionType.RUN and not purge: 

1699 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1700 if collectionType is not CollectionType.RUN and purge: 

1701 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1702 

1703 def remove(child: str, parent: str) -> None: 

1704 """Remove a child collection from a parent collection.""" 

1705 # Remove child from parent. 

1706 chain = list(self.registry.getCollectionChain(parent)) 

1707 try: 

1708 chain.remove(child) 

1709 except ValueError as e: 

1710 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1711 self.registry.setCollectionChain(parent, chain) 

1712 

1713 with self.datastore.transaction(): 

1714 with self.registry.transaction(): 

1715 if unlink: 

1716 for parent in unlink: 

1717 remove(name, parent) 

1718 if unstore: 

1719 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1720 self.datastore.trash(refs) 

1721 self.registry.removeCollection(name) 

1722 

1723 if unstore: 

1724 # Point of no return for removing artifacts 

1725 self.datastore.emptyTrash() 

1726 

1727 def pruneDatasets( 

1728 self, 

1729 refs: Iterable[DatasetRef], 

1730 *, 

1731 disassociate: bool = True, 

1732 unstore: bool = False, 

1733 tags: Iterable[str] = (), 

1734 purge: bool = False, 

1735 ) -> None: 

1736 # docstring inherited from LimitedButler 

1737 

1738 if not self.isWriteable(): 

1739 raise TypeError("Butler is read-only.") 

1740 if purge: 

1741 if not disassociate: 

1742 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1743 if not unstore: 

1744 raise TypeError("Cannot pass purge=True without unstore=True.") 

1745 elif disassociate: 

1746 tags = tuple(tags) 

1747 if not tags: 

1748 raise TypeError("No tags provided but disassociate=True.") 

1749 for tag in tags: 

1750 collectionType = self.registry.getCollectionType(tag) 

1751 if collectionType is not CollectionType.TAGGED: 

1752 raise TypeError( 

1753 f"Cannot disassociate from collection '{tag}' " 

1754 f"of non-TAGGED type {collectionType.name}." 

1755 ) 

1756 # For an execution butler we want to keep existing UUIDs for the 

1757 # datasets; to do that we need to keep them in the collections but 

1758 # remove them from the datastore. 

1759 if self._allow_put_of_predefined_dataset and purge: 

1760 purge = False 

1761 disassociate = False 

1762 # Transform possibly-single-pass iterable into something we can iterate 

1763 # over multiple times. 

1764 refs = list(refs) 

1765 # Pruning a component of a DatasetRef makes no sense since registry 

1766 # doesn't know about components and datastore might not store 

1767 # components in a separate file 

1768 for ref in refs: 

1769 if ref.datasetType.component(): 

1770 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1771 # We don't need an unreliable Datastore transaction for this, because 

1772 # we've been extra careful to ensure that Datastore.trash only involves 

1773 # mutating the Registry (it can _look_ at Datastore-specific things, 

1774 # but shouldn't change them), and hence all operations here are 

1775 # Registry operations. 

1776 with self.datastore.transaction(): 

1777 with self.registry.transaction(): 

1778 if unstore: 

1779 self.datastore.trash(refs) 

1780 if purge: 

1781 self.registry.removeDatasets(refs) 

1782 elif disassociate: 

1783 assert tags, "Guaranteed by earlier logic in this function." 

1784 for tag in tags: 

1785 self.registry.disassociate(tag, refs) 

1786 # We've exited the Registry transaction, and apparently committed. 

1787 # (if there was an exception, everything rolled back, and it's as if 

1788 # nothing happened - and we never get here). 

1789 # Datastore artifacts are not yet gone, but they're clearly marked 

1790 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1791 # problems we can try again later, and if manual administrative 

1792 # intervention is required, it's pretty clear what that should entail: 

1793 # deleting everything on disk and in private Datastore tables that is 

1794 # in the dataset_location_trash table. 

1795 if unstore: 

1796 # Point of no return for removing artifacts 

1797 self.datastore.emptyTrash() 

1798 

1799 @transactional 

1800 def ingest( 

1801 self, 

1802 *datasets: FileDataset, 

1803 transfer: Optional[str] = "auto", 

1804 run: Optional[str] = None, 

1805 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1806 record_validation_info: bool = True, 

1807 ) -> None: 

1808 """Store and register one or more datasets that already exist on disk. 

1809 

1810 Parameters 

1811 ---------- 

1812 datasets : `FileDataset` 

1813 Each positional argument is a struct containing information about 

1814 a file to be ingested, including its URI (either absolute or 

1815 relative to the datastore root, if applicable), a `DatasetRef`, 

1816 and optionally a formatter class or its fully-qualified string 

1817 name. If a formatter is not provided, the formatter that would be 

1818 used for `put` is assumed. On successful return, all 

1819 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1820 attribute populated and all `FileDataset.formatter` attributes will 

1821 be set to the formatter class used. `FileDataset.path` attributes 

1822 may be modified to put paths in whatever the datastore considers a 

1823 standardized form. 

1824 transfer : `str`, optional 

1825 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1826 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1827 transfer the file. 

1828 run : `str`, optional 

1829 The name of the run ingested datasets should be added to, 

1830 overriding ``self.run``. 

1831 idGenerationMode : `DatasetIdGenEnum`, optional 

1832 Specifies option for generating dataset IDs. By default unique IDs 

1833 are generated for each inserted dataset. 

1834 record_validation_info : `bool`, optional 

1835 If `True`, the default, the datastore can record validation 

1836 information associated with the file. If `False` the datastore 

1837 will not attempt to track any information such as checksums 

1838 or file sizes. This can be useful if such information is tracked 

1839 in an external system or if the file is to be compressed in place. 

1840 It is up to the datastore whether this parameter is relevant. 

1841 

1842 Raises 

1843 ------ 

1844 TypeError 

1845 Raised if the butler is read-only or if no run was provided. 

1846 NotImplementedError 

1847 Raised if the `Datastore` does not support the given transfer mode. 

1848 DatasetTypeNotSupportedError 

1849 Raised if one or more files to be ingested have a dataset type that 

1850 is not supported by the `Datastore`. 

1851 FileNotFoundError 

1852 Raised if one of the given files does not exist. 

1853 FileExistsError 

1854 Raised if transfer is not `None` but the (internal) location the 

1855 file would be moved to is already occupied. 

1856 

1857 Notes 

1858 ----- 

1859 This operation is not fully exception safe: if a database operation 

1860 fails, the given `FileDataset` instances may be only partially updated. 

1861 

1862 It is atomic in terms of database operations (they will either all 

1863 succeed or all fail) providing the database engine implements 

1864 transactions correctly. It will attempt to be atomic in terms of 

1865 filesystem operations as well, but this cannot be implemented 

1866 rigorously for most datastores. 
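
Examples
--------
An illustrative sketch that ingests a single existing file; the file
path, dataset type, data ID, and run name are hypothetical, and the
dataset type is assumed to be registered already::

from lsst.daf.butler import DatasetRef, FileDataset

datasetType = butler.registry.getDatasetType("raw")
ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 12345,
"detector": 42})
butler.ingest(FileDataset(path="/data/raw_12345_42.fits", refs=[ref]),
transfer="copy", run="HSC/raw/demo")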

1867 """ 

1868 if not self.isWriteable(): 

1869 raise TypeError("Butler is read-only.") 

1870 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1871 # Reorganize the inputs so they're grouped by DatasetType and then 

1872 # data ID. We also include a list of DatasetRefs for each FileDataset 

1873 # to hold the resolved DatasetRefs returned by the Registry, before 

1874 # it's safe to swap them into FileDataset.refs. 

1875 # Some type annotation aliases to make that clearer: 

1876 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1877 GroupedData = MutableMapping[DatasetType, GroupForType] 

1878 # The actual data structure: 

1879 groupedData: GroupedData = defaultdict(dict) 

1880 # And the nested loop that populates it: 

1881 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1882 # This list intentionally shared across the inner loop, since it's 

1883 # associated with `dataset`. 

1884 resolvedRefs: List[DatasetRef] = [] 

1885 

1886 # Somewhere to store pre-existing refs if we have an 

1887 # execution butler. 

1888 existingRefs: List[DatasetRef] = [] 

1889 

1890 for ref in dataset.refs: 

1891 if ref.dataId in groupedData[ref.datasetType]: 

1892 raise ConflictingDefinitionError( 

1893 f"Ingest conflict. Dataset {dataset.path} has same" 

1894 " DataId as other ingest dataset" 

1895 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1896 f" ({ref.dataId})" 

1897 ) 

1898 if self._allow_put_of_predefined_dataset: 

1899 existing_ref = self.registry.findDataset( 

1900 ref.datasetType, dataId=ref.dataId, collections=run 

1901 ) 

1902 if existing_ref: 

1903 if self.datastore.knows(existing_ref): 

1904 raise ConflictingDefinitionError( 

1905 f"Dataset associated with path {dataset.path}" 

1906 f" already exists as {existing_ref}." 

1907 ) 

1908 # Store this ref elsewhere since it already exists 

1909 # and we do not want to remake it but we do want 

1910 # to store it in the datastore. 

1911 existingRefs.append(existing_ref) 

1912 

1913 # Nothing else to do until we have finished 

1914 # iterating. 

1915 continue 

1916 

1917 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1918 

1919 if existingRefs: 

1920 

1921 if len(dataset.refs) != len(existingRefs): 

1922 # Keeping track of partially pre-existing datasets is hard 

1923 # and should generally never happen. For now don't allow 

1924 # it. 

1925 raise ConflictingDefinitionError( 

1926 f"For dataset {dataset.path} some dataIds already exist" 

1927 " in registry but others do not. This is not supported." 

1928 ) 

1929 

1930 # Attach the resolved refs if we found them. 

1931 dataset.refs = existingRefs 

1932 

1933 # Now we can bulk-insert into Registry for each DatasetType. 

1934 for datasetType, groupForType in progress.iter_item_chunks( 

1935 groupedData.items(), desc="Bulk-inserting datasets by type" 

1936 ): 

1937 refs = self.registry.insertDatasets( 

1938 datasetType, 

1939 dataIds=groupForType.keys(), 

1940 run=run, 

1941 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1942 idGenerationMode=idGenerationMode, 

1943 ) 

1944 # Append those resolved DatasetRefs to the new lists we set up for 

1945 # them. 

1946 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1947 resolvedRefs.append(ref) 

1948 

1949 # Go back to the original FileDatasets to replace their refs with the 

1950 # new resolved ones. 

1951 for groupForType in progress.iter_chunks( 

1952 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1953 ): 

1954 for dataset, resolvedRefs in groupForType.values(): 

1955 dataset.refs = resolvedRefs 

1956 

1957 # Bulk-insert everything into Datastore. 

1958 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1959 

1960 @contextlib.contextmanager 

1961 def export( 

1962 self, 

1963 *, 

1964 directory: Optional[str] = None, 

1965 filename: Optional[str] = None, 

1966 format: Optional[str] = None, 

1967 transfer: Optional[str] = None, 

1968 ) -> Iterator[RepoExportContext]: 

1969 """Export datasets from the repository represented by this `Butler`. 

1970 

1971 This method is a context manager that returns a helper object 

1972 (`RepoExportContext`) that is used to indicate what information from 

1973 the repository should be exported. 

1974 

1975 Parameters 

1976 ---------- 

1977 directory : `str`, optional 

1978 Directory dataset files should be written to if ``transfer`` is not 

1979 `None`. 

1980 filename : `str`, optional 

1981 Name for the file that will include database information associated 

1982 with the exported datasets. If this is not an absolute path and 

1983 ``directory`` is not `None`, it will be written to ``directory`` 

1984 instead of the current working directory. Defaults to 

1985 "export.{format}". 

1986 format : `str`, optional 

1987 File format for the database information file. If `None`, the 

1988 extension of ``filename`` will be used. 

1989 transfer : `str`, optional 

1990 Transfer mode passed to `Datastore.export`. 

1991 

1992 Raises 

1993 ------ 

1994 TypeError 

1995 Raised if the set of arguments passed is inconsistent. 

1996 

1997 Examples 

1998 -------- 

1999 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2000 methods are used to provide the iterables over data IDs and/or datasets 

2001 to be exported:: 

2002 

2003 with butler.export(filename="exports.yaml") as export: 

2004 # Export all flats, but none of the dimension element rows 

2005 # (i.e. data ID information) associated with them. 

2006 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2007 elements=()) 

2008 # Export all datasets that start with "deepCoadd_" and all of 

2009 # their associated data ID information. 

2010 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2011 """ 

2012 if directory is None and transfer is not None: 

2013 raise TypeError("Cannot transfer without providing a directory.") 

2014 if transfer == "move": 

2015 raise TypeError("Transfer may not be 'move': export is read-only") 

2016 if format is None: 

2017 if filename is None: 

2018 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2019 else: 

2020 _, format = os.path.splitext(filename) 

2021 elif filename is None: 

2022 filename = f"export.{format}" 

2023 if directory is not None: 

2024 filename = os.path.join(directory, filename) 

2025 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

2026 with open(filename, "w") as stream: 

2027 backend = BackendClass(stream, universe=self.registry.dimensions) 

2028 try: 

2029 helper = RepoExportContext( 

2030 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2031 ) 

2032 yield helper 

2033 except BaseException: 

2034 raise 

2035 else: 

2036 helper._finish() 

2037 

2038 def import_( 

2039 self, 

2040 *, 

2041 directory: Optional[str] = None, 

2042 filename: Union[str, TextIO, None] = None, 

2043 format: Optional[str] = None, 

2044 transfer: Optional[str] = None, 

2045 skip_dimensions: Optional[Set] = None, 

2046 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2047 reuseIds: bool = False, 

2048 ) -> None: 

2049 """Import datasets into this repository that were exported from a 

2050 different butler repository via `~lsst.daf.butler.Butler.export`. 

2051 

2052 Parameters 

2053 ---------- 

2054 directory : `str`, optional 

2055 Directory containing dataset files to import from. If `None`, 

2056 ``filename`` and all dataset file paths specified therein must 

2057 be absolute. 

2058 filename : `str` or `TextIO`, optional 

2059 A stream or name of file that contains database information 

2060 associated with the exported datasets, typically generated by 

2061 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

2062 is not an absolute path, does not exist in the current working 

2063 directory, and ``directory`` is not `None`, it is assumed to be in 

2064 ``directory``. Defaults to "export.{format}". 

2065 format : `str`, optional 

2066 File format for ``filename``. If `None`, the extension of 

2067 ``filename`` will be used. 

2068 transfer : `str`, optional 

2069 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2070 skip_dimensions : `set`, optional 

2071 Names of dimensions that should be skipped and not imported. 

2072 idGenerationMode : `DatasetIdGenEnum`, optional 

2073 Specifies option for generating dataset IDs when IDs are not 

2074 provided or their type does not match backend type. By default 

2075 unique IDs are generated for each inserted dataset. 

2076 reuseIds : `bool`, optional 

2077 If `True` then forces re-use of imported dataset IDs for integer 

2078 IDs which are normally generated as auto-incremented; exception 

2079 will be raised if imported IDs clash with existing ones. This 

2080 option has no effect on the use of globally-unique IDs which are 

2081 always re-used (or generated if integer IDs are being imported). 

2082 

2083 Raises 

2084 ------ 

2085 TypeError 

2086 Raised if the set of arguments passed is inconsistent, or if the 

2087 butler is read-only. 
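
Examples
--------
An illustrative call re-importing an export created with
`~lsst.daf.butler.Butler.export`; the paths are hypothetical::

butler.import_(directory="/path/to/exported/files",
filename="exports.yaml", transfer="copy")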

2088 """ 

2089 if not self.isWriteable(): 

2090 raise TypeError("Butler is read-only.") 

2091 if format is None: 

2092 if filename is None: 

2093 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2094 else: 

2095 _, format = os.path.splitext(filename) # type: ignore 

2096 elif filename is None: 

2097 filename = f"export.{format}" 

2098 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2099 filename = os.path.join(directory, filename) 

2100 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2101 

2102 def doImport(importStream: TextIO) -> None: 

2103 backend = BackendClass(importStream, self.registry) 

2104 backend.register() 

2105 with self.transaction(): 

2106 backend.load( 

2107 self.datastore, 

2108 directory=directory, 

2109 transfer=transfer, 

2110 skip_dimensions=skip_dimensions, 

2111 idGenerationMode=idGenerationMode, 

2112 reuseIds=reuseIds, 

2113 ) 

2114 

2115 if isinstance(filename, str): 

2116 with open(filename, "r") as stream: 

2117 doImport(stream) 

2118 else: 

2119 doImport(filename) 

2120 

2121 def transfer_from( 

2122 self, 

2123 source_butler: Butler, 

2124 source_refs: Iterable[DatasetRef], 

2125 transfer: str = "auto", 

2126 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

2127 skip_missing: bool = True, 

2128 register_dataset_types: bool = False, 

2129 transfer_dimensions: bool = False, 

2130 ) -> List[DatasetRef]: 

2131 """Transfer datasets to this Butler from a run in another Butler. 

2132 

2133 Parameters 

2134 ---------- 

2135 source_butler : `Butler` 

2136 Butler from which the datasets are to be transferred. 

2137 source_refs : iterable of `DatasetRef` 

2138 Datasets defined in the source butler that should be transferred to 

2139 this butler. 

2140 transfer : `str`, optional 

2141 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2142 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2143 A mapping of dataset type to ID generation mode. Only used if 

2144 the source butler is using integer IDs. Should not be used 

2145 if the receiving butler uses integer IDs. Without this mapping, 

2146 dataset import always uses `DatasetIdGenEnum.UNIQUE`. 

2147 skip_missing : `bool` 

2148 If `True`, datasets with no datastore artifact associated with 

2149 them are not transferred. If `False` a registry entry will be 

2150 created even if no datastore record is created (and so will 

2151 look equivalent to the dataset being unstored). 

2152 register_dataset_types : `bool` 

2153 If `True` any missing dataset types are registered. Otherwise 

2154 an exception is raised. 

2155 transfer_dimensions : `bool`, optional 

2156 If `True`, dimension record data associated with the new datasets 

2157 will be transferred. 

2158 

2159 Returns 

2160 ------- 

2161 refs : `list` of `DatasetRef` 

2162 The refs added to this Butler. 

2163 

2164 Notes 

2165 ----- 

2166 Requires that any dimension definitions are already present in the 

2167 receiving Butler. The datastore artifact has to exist for a transfer 

2168 to be made but non-existence is not an error. 

2169 

2170 Datasets that already exist in this run will be skipped. 

2171 

2172 The datasets are imported as part of a transaction, although 

2173 dataset types are registered before the transaction is started. 

2174 This means that it is possible for a dataset type to be registered 

2175 even though transfer has failed. 
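
Examples
--------
An illustrative sketch that copies some hypothetical datasets from a
source repository into this one::

from lsst.daf.butler import Butler

source = Butler("/path/to/source/repo")
refs = source.registry.queryDatasets("calexp",
collections="HSC/runs/demo")
butler.transfer_from(source, refs, transfer="copy",
register_dataset_types=True)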

2176 """ 

2177 if not self.isWriteable(): 

2178 raise TypeError("Butler is read-only.") 

2179 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2180 

2181 # Will iterate through the refs multiple times so need to convert 

2182 # to a list if this isn't a collection. 

2183 if not isinstance(source_refs, collections.abc.Collection): 

2184 source_refs = list(source_refs) 

2185 

2186 original_count = len(source_refs) 

2187 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2188 

2189 if id_gen_map is None: 

2190 id_gen_map = {} 

2191 

2192 # In some situations the datastore artifact may be missing 

2193 # and we do not want that registry entry to be imported. 

2194 # Asking datastore is not sufficient, the records may have been 

2195 # purged, we have to ask for the (predicted) URI and check 

2196 # existence explicitly. Execution butler is set up exactly like 

2197 # this with no datastore records. 

2198 artifact_existence: Dict[ResourcePath, bool] = {} 

2199 if skip_missing: 

2200 dataset_existence = source_butler.datastore.mexists( 

2201 source_refs, artifact_existence=artifact_existence 

2202 ) 

2203 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2204 filtered_count = len(source_refs) 

2205 log.verbose( 

2206 "%d datasets removed because the artifact does not exist. Now have %d.", 

2207 original_count - filtered_count, 

2208 filtered_count, 

2209 ) 

2210 

2211 # Importing requires that we group the refs by dataset type and run 

2212 # before doing the import. 

2213 source_dataset_types = set() 

2214 grouped_refs = defaultdict(list) 

2215 grouped_indices = defaultdict(list) 

2216 for i, ref in enumerate(source_refs): 

2217 grouped_refs[ref.datasetType, ref.run].append(ref) 

2218 grouped_indices[ref.datasetType, ref.run].append(i) 

2219 source_dataset_types.add(ref.datasetType) 

2220 

2221 # Check to see if the dataset type in the source butler has 

2222 # the same definition in the target butler and register missing 

2223 # ones if requested. Registration must happen outside a transaction. 

2224 newly_registered_dataset_types = set() 

2225 for datasetType in source_dataset_types: 

2226 if register_dataset_types: 

2227 # Let this raise immediately if inconsistent. Continuing 

2228 # on to find additional inconsistent dataset types 

2229 # might result in additional unwanted dataset types being 

2230 # registered. 

2231 if self.registry.registerDatasetType(datasetType): 

2232 newly_registered_dataset_types.add(datasetType) 

2233 else: 

2234 # If the dataset type is missing, let it fail immediately. 

2235 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2236 if target_dataset_type != datasetType: 

2237 raise ConflictingDefinitionError( 

2238 "Source butler dataset type differs from definition" 

2239 f" in target butler: {datasetType} !=" 

2240 f" {target_dataset_type}" 

2241 ) 

2242 if newly_registered_dataset_types: 

2243 # We may have registered some even if there were inconsistencies 

2244 # but should let people know (or else remove them again). 

2245 log.log( 

2246 VERBOSE, 

2247 "Registered the following dataset types in the target Butler: %s", 

2248 ", ".join(d.name for d in newly_registered_dataset_types), 

2249 ) 

2250 else: 

2251 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2252 

2253 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2254 if transfer_dimensions: 

2255 # Collect all the dimension records for these refs. 

2256 # All dimensions are to be copied but the list of valid dimensions 

2257 # come from this butler's universe. 

2258 elements = frozenset( 

2259 element 

2260 for element in self.registry.dimensions.getStaticElements() 

2261 if element.hasTable() and element.viewOf is None 

2262 ) 

2263 dataIds = set(ref.dataId for ref in source_refs) 

2264 # This logic comes from saveDataIds. 

2265 for dataId in dataIds: 

2266 # Should be a no-op if the ref has already been expanded. 

2267 dataId = source_butler.registry.expandDataId(dataId) 

2268 # If this butler doesn't know about a dimension in the source 

2269 # butler things will break later. 

2270 for record in dataId.records.values(): 

2271 if record is not None and record.definition in elements: 

2272 dimension_records[record.definition].setdefault(record.dataId, record) 

2273 

2274 # The returned refs should be identical for UUIDs. 

2275 # For now must also support integers and so need to retain the 

2276 # newly-created refs from this registry. 

2277 # Pre-size it so we can assign refs into the correct slots 

2278 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2279 default_id_gen = DatasetIdGenEnum.UNIQUE 

2280 

2281 handled_collections: Set[str] = set() 

2282 

2283 # Do all the importing in a single transaction. 

2284 with self.transaction(): 

2285 if dimension_records: 

2286 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2287 for element, r in dimension_records.items(): 

2288 records = [r[dataId] for dataId in r] 

2289 # Assume that if the record is already present that we can 

2290 # use it without having to check that the record metadata 

2291 # is consistent. 

2292 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2293 

2294 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2295 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2296 ): 

2297 if run not in handled_collections: 

2298 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2299 registered = self.registry.registerRun(run, doc=run_doc) 

2300 handled_collections.add(run) 

2301 if registered: 

2302 log.log(VERBOSE, "Creating output run %s", run) 

2303 

2304 id_generation_mode = default_id_gen 

2305 if isinstance(refs_to_import[0].id, int): 

2306 # ID generation mode might need to be overridden when 

2307 # targeting UUID 

2308 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2309 

2310 n_refs = len(refs_to_import) 

2311 log.verbose( 

2312 "Importing %d ref%s of dataset type %s into run %s", 

2313 n_refs, 

2314 "" if n_refs == 1 else "s", 

2315 datasetType.name, 

2316 run, 

2317 ) 

2318 

2319 # No way to know if this butler's registry uses UUID. 

2320 # We have to trust the caller on this. If it fails they will 

2321 # have to change their approach. We can't catch the exception 

2322 # and retry with unique because that will mess up the 

2323 # transaction handling. We aren't allowed to ask the registry 

2324 # manager what type of ID it is using. 

2325 imported_refs = self.registry._importDatasets( 

2326 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2327 ) 

2328 

2329 # Map them into the correct slots to match the initial order 

2330 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2331 transferred_refs_tmp[i] = ref 

2332 

2333 # Mypy insists that we might have None in here so we have to make 

2334 # that explicit by assigning to a new variable and filtering out 

2335 # something that won't be there. 

2336 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2337 

2338 # Check consistency 

2339 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2340 

2341 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2342 

2343 # The transferred refs need to be reordered to match the original 

2344 # ordering given by the caller. Without this the datastore transfer 

2345 # will be broken. 

2346 

2347 # Ask the datastore to transfer. The datastore has to check that 

2348 # the source datastore is compatible with the target datastore. 

2349 self.datastore.transfer_from( 

2350 source_butler.datastore, 

2351 source_refs, 

2352 local_refs=transferred_refs, 

2353 transfer=transfer, 

2354 artifact_existence=artifact_existence, 

2355 ) 

2356 

2357 return transferred_refs 

2358 

2359 def validateConfiguration( 

2360 self, 

2361 logFailures: bool = False, 

2362 datasetTypeNames: Optional[Iterable[str]] = None, 

2363 ignore: Optional[Iterable[str]] = None, 

2364 ) -> None: 

2365 """Validate butler configuration. 

2366 

2367 Checks that each `DatasetType` can be stored in the `Datastore`. 

2368 

2369 Parameters 

2370 ---------- 

2371 logFailures : `bool`, optional 

2372 If `True`, output a log message for every validation error 

2373 detected. 

2374 datasetTypeNames : iterable of `str`, optional 

2375 The `DatasetType` names that should be checked. This allows 

2376 only a subset to be selected. 

2377 ignore : iterable of `str`, optional 

2378 Names of DatasetTypes to skip over. This can be used to skip 

2379 known problems. If a named `DatasetType` corresponds to a 

2380 composite, all components of that `DatasetType` will also be 

2381 ignored. 

2382 

2383 Raises 

2384 ------ 

2385 ButlerValidationError 

2386 Raised if there is some inconsistency with how this Butler 

2387 is configured. 
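
Examples
--------
An illustrative check of a hypothetical subset of dataset types,
logging any problems that are found::

butler.validateConfiguration(logFailures=True,
datasetTypeNames=["raw", "calexp"])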

2388 """ 

2389 if datasetTypeNames: 

2390 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2391 else: 

2392 datasetTypes = list(self.registry.queryDatasetTypes()) 

2393 

2394 # filter out anything from the ignore list 

2395 if ignore: 

2396 ignore = set(ignore) 

2397 datasetTypes = [ 

2398 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2399 ] 

2400 else: 

2401 ignore = set() 

2402 

2403 # Find all the registered instruments 

2404 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2405 

2406 # For each datasetType that has an instrument dimension, create 

2407 # a DatasetRef for each defined instrument 

2408 datasetRefs = [] 

2409 

2410 for datasetType in datasetTypes: 

2411 if "instrument" in datasetType.dimensions: 

2412 for instrument in instruments: 

2413 datasetRef = DatasetRef( 

2414 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2415 ) 

2416 datasetRefs.append(datasetRef) 

2417 

2418 entities: List[Union[DatasetType, DatasetRef]] = [] 

2419 entities.extend(datasetTypes) 

2420 entities.extend(datasetRefs) 

2421 

2422 datastoreErrorStr = None 

2423 try: 

2424 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2425 except ValidationError as e: 

2426 datastoreErrorStr = str(e) 

2427 

2428 # Also check that the LookupKeys used by the datastores match 

2429 # registry and storage class definitions 

2430 keys = self.datastore.getLookupKeys() 

2431 

2432 failedNames = set() 

2433 failedDataId = set() 

2434 for key in keys: 

2435 if key.name is not None: 

2436 if key.name in ignore: 

2437 continue 

2438 

2439 # skip if specific datasetType names were requested and this 

2440 # name does not match 

2441 if datasetTypeNames and key.name not in datasetTypeNames: 

2442 continue 

2443 

2444 # See if it is a StorageClass or a DatasetType 

2445 if key.name in self.storageClasses: 

2446 pass 

2447 else: 

2448 try: 

2449 self.registry.getDatasetType(key.name) 

2450 except KeyError: 

2451 if logFailures: 

2452 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2453 failedNames.add(key) 

2454 else: 

2455 # Dimensions are checked for consistency when the Butler 

2456 # is created and rendezvoused with a universe. 

2457 pass 

2458 

2459 # Check that the instrument is a valid instrument 

2460 # Currently only support instrument so check for that 

2461 if key.dataId: 

2462 dataIdKeys = set(key.dataId) 

2463 if set(["instrument"]) != dataIdKeys: 

2464 if logFailures: 

2465 log.critical("Key '%s' has unsupported DataId override", key) 

2466 failedDataId.add(key) 

2467 elif key.dataId["instrument"] not in instruments: 

2468 if logFailures: 

2469 log.critical("Key '%s' has unknown instrument", key) 

2470 failedDataId.add(key) 

2471 

2472 messages = [] 

2473 

2474 if datastoreErrorStr: 

2475 messages.append(datastoreErrorStr) 

2476 

2477 for failed, msg in ( 

2478 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2479 (failedDataId, "Keys with bad DataId entries: "), 

2480 ): 

2481 if failed: 

2482 msg += ", ".join(str(k) for k in failed) 

2483 messages.append(msg) 

2484 

2485 if messages: 

2486 raise ValidationError(";\n".join(messages)) 

2487 

2488 @property 

2489 def collections(self) -> CollectionSearch: 

2490 """The collections to search by default, in order (`CollectionSearch`). 

2491 

2492 This is an alias for ``self.registry.defaults.collections``. It cannot 

2493 be set directly in isolation, but all defaults may be changed together 

2494 by assigning a new `RegistryDefaults` instance to 

2495 ``self.registry.defaults``. 

2496 """ 

2497 return self.registry.defaults.collections 

2498 

2499 @property 

2500 def run(self) -> Optional[str]: 

2501 """Name of the run this butler writes outputs to by default (`str` or 

2502 `None`). 

2503 

2504 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2505 directly in isolation, but all defaults may be changed together by 

2506 assigning a new `RegistryDefaults` instance to 

2507 ``self.registry.defaults``. 

2508 """ 

2509 return self.registry.defaults.run 

2510 

2511 @property 

2512 def dimensions(self) -> DimensionUniverse: 

2513 # Docstring inherited. 

2514 return self.registry.dimensions 

2515 

2516 registry: Registry 

2517 """The object that manages dataset metadata and relationships (`Registry`). 

2518 

2519 Most operations that don't involve reading or writing butler datasets are 

2520 accessible only via `Registry` methods. 

2521 """ 

2522 

2523 datastore: Datastore 

2524 """The object that manages actual dataset storage (`Datastore`). 

2525 

2526 Direct user access to the datastore should rarely be necessary; the primary 

2527 exception is the case where a `Datastore` implementation provides extra 

2528 functionality beyond what the base class defines. 

2529 """ 

2530 

2531 storageClasses: StorageClassFactory 

2532 """An object that maps known storage class names to objects that fully 

2533 describe them (`StorageClassFactory`). 

2534 """ 

2535 

2536 _allow_put_of_predefined_dataset: bool 

2537 """Allow a put to succeed even if there is already a registry entry for it 

2538 but not a datastore record. (`bool`)."""