Coverage for python/lsst/daf/butler/_butler.py: 9% (680 statements)


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_class_of 

62from lsst.utils.logging import VERBOSE, getLogger 

63 

64from ._butlerConfig import ButlerConfig 

65from ._butlerRepoIndex import ButlerRepoIndex 

66from ._deferredDatasetHandle import DeferredDatasetHandle 

67from ._limited_butler import LimitedButler 

68from .core import ( 

69 AmbiguousDatasetError, 

70 Config, 

71 ConfigSubset, 

72 DataCoordinate, 

73 DataId, 

74 DataIdValue, 

75 DatasetRef, 

76 DatasetRefURIs, 

77 DatasetType, 

78 Datastore, 

79 Dimension, 

80 DimensionConfig, 

81 DimensionElement, 

82 DimensionRecord, 

83 DimensionUniverse, 

84 FileDataset, 

85 Progress, 

86 StorageClassFactory, 

87 Timespan, 

88 ValidationError, 

89) 

90from .core.repoRelocation import BUTLER_ROOT_TAG 

91from .core.utils import transactional 

92from .registry import ( 

93 CollectionSearch, 

94 CollectionType, 

95 ConflictingDefinitionError, 

96 DataIdError, 

97 DatasetIdGenEnum, 

98 Registry, 

99 RegistryConfig, 

100 RegistryDefaults, 

101) 

102from .transfers import RepoExportContext 

103 

104log = getLogger(__name__) 

105 

106 

107class ButlerValidationError(ValidationError): 

108 """There is a problem with the Butler configuration.""" 

109 

110 pass 

111 

112 

113class PruneCollectionsArgsError(TypeError): 

114 """Base class for errors relating to Butler.pruneCollections input 

115 arguments. 

116 """ 

117 

118 pass 

119 

120 

121class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

122 """Raised when purge and unstore are both required to be True, and 

123 purge is True but unstore is False. 

124 """ 

125 

126 def __init__(self) -> None: 

127 super().__init__("Cannot pass purge=True without unstore=True.") 

128 

129 

130class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

131 """Raised when pruning a RUN collection but purge is False.""" 

132 

133 def __init__(self, collectionType: CollectionType): 

134 self.collectionType = collectionType 

135 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

136 

137 

138class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

139 """Raised when purge is True but is not supported for the given 

140 collection.""" 

141 

142 def __init__(self, collectionType: CollectionType): 

143 self.collectionType = collectionType 

144 super().__init__( 

145 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

146 ) 

147 

148 

149class Butler(LimitedButler): 

150 """Main entry point for the data access system. 

151 

152 Parameters 

153 ---------- 

154 config : `ButlerConfig`, `Config` or `str`, optional 

155 Configuration. Anything acceptable to the 

156 `ButlerConfig` constructor. If a directory path 

157 is given the configuration will be read from a ``butler.yaml`` file in 

158 that location. If `None` is given default values will be used. 

159 butler : `Butler`, optional 

160 If provided, construct a new Butler that uses the same registry and 

161 datastore as the given one, but with the given collection and run. 

162 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

163 arguments. 

164 collections : `str` or `Iterable` [ `str` ], optional 

165 An expression specifying the collections to be searched (in order) when 

166 reading datasets. 

167 This may be a `str` collection name or an iterable thereof. 

168 See :ref:`daf_butler_collection_expressions` for more information. 

169 These collections are not registered automatically and must be 

170 registered manually before they are used by any method, though the 

171 registration can be done after the `Butler` is initialized. 

172 run : `str`, optional 

173 Name of the `~CollectionType.RUN` collection new datasets should be 

174 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

175 ``collections`` will be set to ``[run]``. If not `None`, this 

176 collection will automatically be registered. If this is not set (and 

177 ``writeable`` is not set either), a read-only butler will be created. 

178 searchPaths : `list` of `str`, optional 

179 Directory paths to search when calculating the full Butler 

180 configuration. Not used if the supplied config is already a 

181 `ButlerConfig`. 

182 writeable : `bool`, optional 

183 Explicitly sets whether the butler supports write operations. If not 

184 provided, a read-write butler is created if ``run`` is not `None` and 

185 a read-only butler otherwise. 

186 inferDefaults : `bool`, optional 

187 If `True` (default) infer default data ID values from the values 

188 present in the datasets in ``collections``: if all collections have the 

189 same value (or no value) for a governor dimension, that value will be 

190 the default for that dimension. Nonexistent collections are ignored. 

191 If a default value is provided explicitly for a governor dimension via 

192 ``**kwargs``, no default will be inferred for that dimension. 

193 **kwargs : `str` 

194 Default data ID key-value pairs. These may only identify "governor" 

195 dimensions like ``instrument`` and ``skymap``. 

196 

197 Examples 

198 -------- 

199 While there are many ways to control exactly how a `Butler` interacts with 

200 the collections in its `Registry`, the most common cases are still simple. 

201 

202 For a read-only `Butler` that searches one collection, do:: 

203 

204 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

205 

206 For a read-write `Butler` that writes to and reads from a 

207 `~CollectionType.RUN` collection:: 

208 

209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

210 

211 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

212 because we want to write to one `~CollectionType.RUN` collection but read 

213 from several others (as well):: 

214 

215 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

216 collections=["u/alice/DM-50000/a", 

217 "u/bob/DM-49998", 

218 "HSC/defaults"]) 

219 

220 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

221 Datasets will be read first from that run (since it appears first in the 

222 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

223 

224 Finally, one can always create a `Butler` with no collections:: 

225 

226 butler = Butler("/path/to/repo", writeable=True) 

227 

228 This can be extremely useful when you just want to use ``butler.registry``, 

229 e.g. for inserting dimension data or managing collections, or when the 

230 collections you want to use with the butler are not consistent. 

231 Passing ``writeable`` explicitly here is only necessary if you want to be 

232 able to make changes to the repo; usually the value for ``writeable`` can 

233 be guessed from the collection arguments provided, but it defaults to 

234 `False` when there are no collection arguments. 
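
A default data ID value for a governor dimension can also be supplied
via keyword arguments. As an illustrative sketch (the repository path,
collection, and instrument name are placeholders)::

    butler = Butler(
        "/path/to/repo",
        collections="HSC/defaults",
        instrument="HSC",  # default value used when a data ID omits it
    )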

235 """ 

236 

237 def __init__( 

238 self, 

239 config: Union[Config, str, None] = None, 

240 *, 

241 butler: Optional[Butler] = None, 

242 collections: Any = None, 

243 run: Optional[str] = None, 

244 searchPaths: Optional[List[str]] = None, 

245 writeable: Optional[bool] = None, 

246 inferDefaults: bool = True, 

247 **kwargs: str, 

248 ): 

249 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

250 # Load registry, datastore, etc. from config or existing butler. 

251 if butler is not None: 

252 if config is not None or searchPaths is not None or writeable is not None: 

253 raise TypeError( 

254 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

255 ) 

256 self.registry = butler.registry.copy(defaults) 

257 self.datastore = butler.datastore 

258 self.storageClasses = butler.storageClasses 

259 self._config: ButlerConfig = butler._config 

260 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

261 else: 

262 # Can only look for strings in the known repos list. 

263 if isinstance(config, str) and config in self.get_known_repos(): 

264 config = str(self.get_repo_uri(config)) 

265 try: 

266 self._config = ButlerConfig(config, searchPaths=searchPaths) 

267 except FileNotFoundError as e: 

268 if known := self.get_known_repos(): 

269 aliases = f"(known aliases: {', '.join(known)})" 

270 else: 

271 aliases = "(no known aliases)" 

272 raise FileNotFoundError(f"{e} {aliases}") from e 

273 self._config = ButlerConfig(config, searchPaths=searchPaths) 

274 try: 

275 if "root" in self._config: 

276 butlerRoot = self._config["root"] 

277 else: 

278 butlerRoot = self._config.configDir 

279 if writeable is None: 

280 writeable = run is not None 

281 self.registry = Registry.fromConfig( 

282 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

283 ) 

284 self.datastore = Datastore.fromConfig( 

285 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

286 ) 

287 self.storageClasses = StorageClassFactory() 

288 self.storageClasses.addFromConfig(self._config) 

289 self._allow_put_of_predefined_dataset = self._config.get( 

290 "allow_put_of_predefined_dataset", False 

291 ) 

292 except Exception: 

293 # Failures here usually mean that configuration is incomplete, 

294 # just issue an error message which includes config file URI. 

295 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

296 raise 

297 

298 if "run" in self._config or "collection" in self._config: 

299 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

300 

301 GENERATION: ClassVar[int] = 3 

302 """This is a Generation 3 Butler. 

303 

304 This attribute may be removed in the future, once the Generation 2 Butler 

305 interface has been fully retired; it should only be used in transitional 

306 code. 

307 """ 

308 

309 @classmethod 

310 def get_repo_uri(cls, label: str) -> ResourcePath: 

311 """Look up the label in a butler repository index. 

312 

313 Parameters 

314 ---------- 

315 label : `str` 

316 Label of the Butler repository to look up. 

317 

318 Returns 

319 ------- 

320 uri : `lsst.resources.ResourcePath` 

321 URI to the Butler repository associated with the given label. 

322 

323 Raises 

324 ------ 

325 KeyError 

326 Raised if the label is not found in the index, or if an index 

327 can not be found at all. 

328 

329 Notes 

330 ----- 

331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

332 information is discovered. 
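
Examples
--------
A minimal sketch, assuming a repository index is configured and defines
a label named ``main`` (the label is a placeholder)::

    uri = Butler.get_repo_uri("main")  # -> lsst.resources.ResourcePath
    # The label can also be passed directly to the Butler constructor,
    # which resolves it through the same index.
    butler = Butler("main")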

333 """ 

334 return ButlerRepoIndex.get_repo_uri(label) 

335 

336 @classmethod 

337 def get_known_repos(cls) -> Set[str]: 

338 """Retrieve the list of known repository labels. 

339 

340 Returns 

341 ------- 

342 repos : `set` of `str` 

343 All the known labels. Can be empty if no index can be found. 

344 

345 Notes 

346 ----- 

347 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

348 information is discovered. 
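
Examples
--------
A sketch that lists whatever labels the local repository index defines
(the output depends entirely on that index)::

    for label in sorted(Butler.get_known_repos()):
        print(label, Butler.get_repo_uri(label))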

349 """ 

350 return ButlerRepoIndex.get_known_repos() 

351 

352 @staticmethod 

353 def makeRepo( 

354 root: ResourcePathExpression, 

355 config: Union[Config, str, None] = None, 

356 dimensionConfig: Union[Config, str, None] = None, 

357 standalone: bool = False, 

358 searchPaths: Optional[List[str]] = None, 

359 forceConfigRoot: bool = True, 

360 outfile: Optional[ResourcePathExpression] = None, 

361 overwrite: bool = False, 

362 ) -> Config: 

363 """Create an empty data repository by adding a butler.yaml config 

364 to a repository root directory. 

365 

366 Parameters 

367 ---------- 

368 root : `lsst.resources.ResourcePathExpression` 

369 Path or URI to the root location of the new repository. Will be 

370 created if it does not exist. 

371 config : `Config` or `str`, optional 

372 Configuration to write to the repository, after setting any 

373 root-dependent Registry or Datastore config options. Can not 

374 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

375 configuration will be used. Root-dependent config options 

376 specified in this config are overwritten if ``forceConfigRoot`` 

377 is `True`. 

378 dimensionConfig : `Config` or `str`, optional 

379 Configuration for dimensions; it will be used to initialize the registry 

380 database. 

381 standalone : `bool` 

382 If True, write all expanded defaults, not just customized or 

383 repository-specific settings. 

384 This (mostly) decouples the repository from the default 

385 configuration, insulating it from changes to the defaults (which 

386 may be good or bad, depending on the nature of the changes). 

387 Future *additions* to the defaults will still be picked up when 

388 initializing a `Butler` for repos created with ``standalone=True``. 

389 searchPaths : `list` of `str`, optional 

390 Directory paths to search when calculating the full butler 

391 configuration. 

392 forceConfigRoot : `bool`, optional 

393 If `False`, any values present in the supplied ``config`` that 

394 would normally be reset are not overridden and will appear 

395 directly in the output config. This allows non-standard overrides 

396 of the root directory for a datastore or registry to be given. 

397 If this parameter is `True` the values for ``root`` will be 

398 forced into the resulting config if appropriate. 

399 outfile : `lsst.resources.ResourcePathExpression`, optional 

400 If not-`None`, the output configuration will be written to this 

401 location rather than into the repository itself. Can be a URI 

402 string. Can refer to a directory that will be used to write 

403 ``butler.yaml``. 

404 overwrite : `bool`, optional 

405 Create a new configuration file even if one already exists 

406 in the specified output location. Default is to raise 

407 an exception. 

408 

409 Returns 

410 ------- 

411 config : `Config` 

412 The updated `Config` instance written to the repo. 

413 

414 Raises 

415 ------ 

416 ValueError 

417 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

418 regular Config (as these subclasses would make it impossible to 

419 support ``standalone=False``). 

420 FileExistsError 

421 Raised if the output config file already exists. 

422 os.error 

423 Raised if the directory does not exist, exists but is not a 

424 directory, or cannot be created. 

425 

426 Notes 

427 ----- 

428 Note that when ``standalone=False`` (the default), the configuration 

429 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

430 construct the repository should also be used to construct any Butlers 

431 to avoid configuration inconsistencies. 
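
Examples
--------
A minimal sketch that creates a repository with default configuration
and then opens it for writing (the path is a placeholder)::

    config = Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)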

432 """ 

433 if isinstance(config, (ButlerConfig, ConfigSubset)): 

434 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

435 

436 # Ensure that the root of the repository exists or can be made 

437 root_uri = ResourcePath(root, forceDirectory=True) 

438 root_uri.mkdir() 

439 

440 config = Config(config) 

441 

442 # If we are creating a new repo from scratch with relative roots, 

443 # do not propagate an explicit root from the config file 

444 if "root" in config: 

445 del config["root"] 

446 

447 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

448 imported_class = doImportType(full["datastore", "cls"]) 

449 if not issubclass(imported_class, Datastore): 

450 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

451 datastoreClass: Type[Datastore] = imported_class 

452 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

453 

454 # if key exists in given config, parse it, otherwise parse the defaults 

455 # in the expanded config 

456 if config.get(("registry", "db")): 

457 registryConfig = RegistryConfig(config) 

458 else: 

459 registryConfig = RegistryConfig(full) 

460 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

461 if defaultDatabaseUri is not None: 

462 Config.updateParameters( 

463 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

464 ) 

465 else: 

466 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

467 

468 if standalone: 

469 config.merge(full) 

470 else: 

471 # Always expand the registry.managers section into the per-repo 

472 # config, because after the database schema is created, it's not 

473 # allowed to change anymore. Note that in the standalone=True 

474 # branch, _everything_ in the config is expanded, so there's no 

475 # need to special case this. 

476 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

477 configURI: ResourcePathExpression 

478 if outfile is not None: 

479 # When writing to a separate location we must include 

480 # the root of the butler repo in the config else it won't know 

481 # where to look. 

482 config["root"] = root_uri.geturl() 

483 configURI = outfile 

484 else: 

485 configURI = root_uri 

486 config.dumpToUri(configURI, overwrite=overwrite) 

487 

488 # Create Registry and populate tables 

489 registryConfig = RegistryConfig(config.get("registry")) 

490 dimensionConfig = DimensionConfig(dimensionConfig) 

491 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

492 

493 log.verbose("Wrote new Butler configuration file to %s", configURI) 

494 

495 return config 

496 

497 @classmethod 

498 def _unpickle( 

499 cls, 

500 config: ButlerConfig, 

501 collections: Optional[CollectionSearch], 

502 run: Optional[str], 

503 defaultDataId: Dict[str, str], 

504 writeable: bool, 

505 ) -> Butler: 

506 """Callable used to unpickle a Butler. 

507 

508 We prefer not to use ``Butler.__init__`` directly so we can force some 

509 of its many arguments to be keyword-only (note that ``__reduce__`` 

510 can only invoke callables with positional arguments). 

511 

512 Parameters 

513 ---------- 

514 config : `ButlerConfig` 

515 Butler configuration, already coerced into a true `ButlerConfig` 

516 instance (and hence after any search paths for overrides have been 

517 utilized). 

518 collections : `CollectionSearch` 

519 Names of the default collections to read from. 

520 run : `str`, optional 

521 Name of the default `~CollectionType.RUN` collection to write to. 

522 defaultDataId : `dict` [ `str`, `str` ] 

523 Default data ID values. 

524 writeable : `bool` 

525 Whether the Butler should support write operations. 

526 

527 Returns 

528 ------- 

529 butler : `Butler` 

530 A new `Butler` instance. 

531 """ 

532 # MyPy doesn't recognize that the kwargs below are totally valid; it 

533 # seems to think ``**defaultDataId`` is a _positional_ argument! 

534 return cls( 

535 config=config, 

536 collections=collections, 

537 run=run, 

538 writeable=writeable, 

539 **defaultDataId, # type: ignore 

540 ) 

541 

542 def __reduce__(self) -> tuple: 

543 """Support pickling.""" 

544 return ( 

545 Butler._unpickle, 

546 ( 

547 self._config, 

548 self.collections, 

549 self.run, 

550 self.registry.defaults.dataId.byName(), 

551 self.registry.isWriteable(), 

552 ), 

553 ) 

554 

555 def __str__(self) -> str: 

556 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

557 self.collections, self.run, self.datastore, self.registry 

558 ) 

559 

560 def isWriteable(self) -> bool: 

561 """Return `True` if this `Butler` supports write operations.""" 

562 return self.registry.isWriteable() 

563 

564 @contextlib.contextmanager 

565 def transaction(self) -> Iterator[None]: 

566 """Context manager supporting `Butler` transactions. 

567 

568 Transactions can be nested. 
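
For example, to make two writes atomic (a minimal sketch; ``butler``,
the in-memory objects, the data ID, and the dataset type names are
placeholders that must already exist in the repository)::

    with butler.transaction():
        butler.put(exposure, "calexp", dataId, run="u/alice/demo")
        butler.put(catalog, "src", dataId, run="u/alice/demo")
    # If either put raises, the registry and datastore changes made
    # inside the block are rolled back together.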

569 """ 

570 with self.registry.transaction(): 

571 with self.datastore.transaction(): 

572 yield 

573 

574 def _standardizeArgs( 

575 self, 

576 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

577 dataId: Optional[DataId] = None, 

578 for_put: bool = True, 

579 **kwargs: Any, 

580 ) -> Tuple[DatasetType, Optional[DataId]]: 

581 """Standardize the arguments passed to several Butler APIs. 

582 

583 Parameters 

584 ---------- 

585 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

586 When `DatasetRef` the `dataId` should be `None`. 

587 Otherwise the `DatasetType` or name thereof. 

588 dataId : `dict` or `DataCoordinate` 

589 A `dict` of `Dimension` link name, value pairs that label the 

590 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

591 should be provided as the second argument. 

592 for_put : `bool`, optional 

593 If `True` this call is invoked as part of a `Butler.put()`. 

594 Otherwise it is assumed to be part of a `Butler.get()`. This 

595 parameter is only relevant if there is dataset type 

596 inconsistency. 

597 **kwargs 

598 Additional keyword arguments used to augment or construct a 

599 `DataCoordinate`. See `DataCoordinate.standardize` 

600 parameters. 

601 

602 Returns 

603 ------- 

604 datasetType : `DatasetType` 

605 A `DatasetType` instance extracted from ``datasetRefOrType``. 

606 dataId : `dict` or `DataId`, optional 

607 Argument that can be used (along with ``kwargs``) to construct a 

608 `DataId`. 

609 

610 Notes 

611 ----- 

612 Butler APIs that conceptually need a DatasetRef also allow passing a 

613 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

614 keyword arguments that can be used to construct one) separately. This 

615 method accepts those arguments and always returns a true `DatasetType` 

616 and a `DataId` or `dict`. 

617 

618 Standardization of `dict` vs `DataId` is best handled by passing the 

619 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

620 generally similarly flexible. 
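
As an illustrative sketch, the following two calls identify the same
dataset (the dataset type name and data ID values are placeholders)::

    butler.get("calexp", instrument="HSC", visit=12345, detector=42)
    butler.get("calexp", {"instrument": "HSC", "visit": 12345, "detector": 42})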

621 """ 

622 externalDatasetType: Optional[DatasetType] = None 

623 internalDatasetType: Optional[DatasetType] = None 

624 if isinstance(datasetRefOrType, DatasetRef): 

625 if dataId is not None or kwargs: 

626 raise ValueError("DatasetRef given, cannot use dataId as well") 

627 externalDatasetType = datasetRefOrType.datasetType 

628 dataId = datasetRefOrType.dataId 

629 else: 

630 # Don't check whether DataId is provided, because Registry APIs 

631 # can usually construct a better error message when it wasn't. 

632 if isinstance(datasetRefOrType, DatasetType): 

633 externalDatasetType = datasetRefOrType 

634 else: 

635 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

636 

637 # Check that they are self-consistent 

638 if externalDatasetType is not None: 

639 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

640 if externalDatasetType != internalDatasetType: 

641 # We can allow differences if they are compatible, depending 

642 # on whether this is a get or a put. A get requires that 

643 # the python type associated with the datastore can be 

644 # converted to the user type. A put requires that the user 

645 # supplied python type can be converted to the internal 

646 # type expected by registry. 

647 relevantDatasetType = internalDatasetType 

648 if for_put: 

649 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

650 else: 

651 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

652 relevantDatasetType = externalDatasetType 

653 if not is_compatible: 

654 raise ValueError( 

655 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

656 f"registry definition ({internalDatasetType})" 

657 ) 

658 # Override the internal definition. 

659 internalDatasetType = relevantDatasetType 

660 

661 assert internalDatasetType is not None 

662 return internalDatasetType, dataId 

663 

664 def _rewrite_data_id( 

665 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

666 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

667 """Rewrite a data ID taking into account dimension records. 

668 

669 Take a Data ID and keyword args and rewrite it if necessary to 

670 allow the user to specify dimension records rather than dimension 

671 primary values. 

672 

673 This allows a user to include a dataId dict with keys of 

674 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

675 the integer exposure ID. It also allows a string to be given 

676 for a dimension value rather than the integer ID if that is more 

677 convenient. For example, rather than having to specifyin the 

678 detector with ``detector.full_name``, a string given for ``detector`` 

679 will be interpreted as the full name and converted to the integer 

680 value. 

681 

682 Keyword arguments can also use strings for dimensions like detector 

683 and exposure but python does not allow them to include ``.`` and 

684 so the ``exposure.day_obs`` syntax can not be used in a keyword 

685 argument. 

686 

687 Parameters 

688 ---------- 

689 dataId : `dict` or `DataCoordinate` 

690 A `dict` of `Dimension` link name, value pairs that will label the 

691 `DatasetRef` within a Collection. 

692 datasetType : `DatasetType` 

693 The dataset type associated with this dataId. Required to 

694 determine the relevant dimensions. 

695 **kwargs 

696 Additional keyword arguments used to augment or construct a 

697 `DataId`. See `DataId` parameters. 

698 

699 Returns 

700 ------- 

701 dataId : `dict` or `DataCoordinate` 

702 The dataId, possibly rewritten. If given a `DataCoordinate` and 

703 no keyword arguments, the original dataId will be returned 

704 unchanged. 

705 **kwargs : `dict` 

706 Any unused keyword arguments (normally an empty dict). 
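
Examples
--------
A sketch of the rewriting this enables in `Butler.get` calls (the
dataset type, instrument, and record values are placeholders)::

    # Dimension record fields given as compound keys instead of the
    # integer exposure ID.
    butler.get("raw", {"exposure.day_obs": 20220831, "exposure.seq_num": 123},
               instrument="LATISS", detector=0)
    # A string alternate key (the detector full name) instead of the
    # integer detector ID.
    butler.get("raw", exposure=2022083100123, instrument="LATISS",
               detector="R22_S11")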

707 """ 

708 # Do nothing if we have a standalone DataCoordinate. 

709 if isinstance(dataId, DataCoordinate) and not kwargs: 

710 return dataId, kwargs 

711 

712 # Process dimension records that are using record information 

713 # rather than ids 

714 newDataId: Dict[str, DataIdValue] = {} 

715 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

716 

717 # if all the dataId comes from keyword parameters we do not need 

718 # to do anything here because they can't be of the form 

719 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

720 if dataId: 

721 for k, v in dataId.items(): 

722 # If we have a Dimension we do not need to do anything 

723 # because it cannot be a compound key. 

724 if isinstance(k, str) and "." in k: 

725 # Someone is using a more human-readable dataId 

726 dimensionName, record = k.split(".", 1) 

727 byRecord[dimensionName][record] = v 

728 elif isinstance(k, Dimension): 

729 newDataId[k.name] = v 

730 else: 

731 newDataId[k] = v 

732 

733 # Go through the updated dataId and check the type in case someone is 

734 # using an alternate key. We have already filtered out the compound 

735 # keys in ``dimension.record`` format. 

736 not_dimensions = {} 

737 

738 # Will need to look in the dataId and the keyword arguments 

739 # and will remove them if they need to be fixed or are unrecognized. 

740 for dataIdDict in (newDataId, kwargs): 

741 # Use a list so we can adjust the dict safely in the loop 

742 for dimensionName in list(dataIdDict): 

743 value = dataIdDict[dimensionName] 

744 try: 

745 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

746 except KeyError: 

747 # This is not a real dimension 

748 not_dimensions[dimensionName] = value 

749 del dataIdDict[dimensionName] 

750 continue 

751 

752 # Convert an integral type to an explicit int to simplify 

753 # comparisons here 

754 if isinstance(value, numbers.Integral): 

755 value = int(value) 

756 

757 if not isinstance(value, dimension.primaryKey.getPythonType()): 

758 for alternate in dimension.alternateKeys: 

759 if isinstance(value, alternate.getPythonType()): 

760 byRecord[dimensionName][alternate.name] = value 

761 del dataIdDict[dimensionName] 

762 log.debug( 

763 "Converting dimension %s to %s.%s=%s", 

764 dimensionName, 

765 dimensionName, 

766 alternate.name, 

767 value, 

768 ) 

769 break 

770 else: 

771 log.warning( 

772 "Type mismatch found for value '%r' provided for dimension %s. " 

773 "Could not find matching alternative (primary key has type %s) " 

774 "so attempting to use as-is.", 

775 value, 

776 dimensionName, 

777 dimension.primaryKey.getPythonType(), 

778 ) 

779 

780 # By this point kwargs and newDataId should only include valid 

781 # dimensions. Merge kwargs in to the new dataId and log if there 

782 # are dimensions in both (rather than calling update). 

783 for k, v in kwargs.items(): 

784 if k in newDataId and newDataId[k] != v: 

785 log.debug( 

786 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

787 ) 

788 newDataId[k] = v 

789 # No need to retain any values in kwargs now. 

790 kwargs = {} 

791 

792 # If we have some unrecognized dimensions we have to try to connect 

793 # them to records in other dimensions. This is made more complicated 

794 # by some dimensions having records with clashing names. A mitigation 

795 # is that we can tell by this point which dimensions are missing 

796 # for the DatasetType but this does not work for calibrations 

797 # where additional dimensions can be used to constrain the temporal 

798 # axis. 

799 if not_dimensions: 

800 # Search for all dimensions even if we have been given a value 

801 # explicitly. In some cases records are given as well as the 

802 # actual dimension and this should not be an error if they 

803 # match. 

804 mandatoryDimensions = datasetType.dimensions.names # - provided 

805 

806 candidateDimensions: Set[str] = set() 

807 candidateDimensions.update(mandatoryDimensions) 

808 

809 # For calibrations we may well be needing temporal dimensions 

810 # so rather than always including all dimensions in the scan 

811 # restrict things a little. It is still possible for there 

812 # to be confusion over day_obs in visit vs exposure for example. 

813 # If we are not searching calibration collections things may 

814 # fail but they are going to fail anyway because of the 

815 # ambiguousness of the dataId... 

816 if datasetType.isCalibration(): 

817 for dim in self.registry.dimensions.getStaticDimensions(): 

818 if dim.temporal: 

819 candidateDimensions.add(str(dim)) 

820 

821 # Look up table for the first association with a dimension 

822 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

823 

824 # Keep track of whether an item is associated with multiple 

825 # dimensions. 

826 counter: Counter[str] = Counter() 

827 assigned: Dict[str, Set[str]] = defaultdict(set) 

828 

829 # Go through the missing dimensions and associate the 

830 # given names with records within those dimensions 

831 matched_dims = set() 

832 for dimensionName in candidateDimensions: 

833 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

834 fields = dimension.metadata.names | dimension.uniqueKeys.names 

835 for field in not_dimensions: 

836 if field in fields: 

837 guessedAssociation[dimensionName][field] = not_dimensions[field] 

838 counter[dimensionName] += 1 

839 assigned[field].add(dimensionName) 

840 matched_dims.add(field) 

841 

842 # Calculate the fields that matched nothing. 

843 never_found = set(not_dimensions) - matched_dims 

844 

845 if never_found: 

846 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

847 

848 # There is a chance we have allocated a single dataId item 

849 # to multiple dimensions. Need to decide which should be retained. 

850 # For now assume that the most popular alternative wins. 

851 # This means that day_obs with seq_num will result in 

852 # exposure.day_obs and not visit.day_obs 

853 # Also prefer an explicitly missing dimension over an inferred 

854 # temporal dimension. 

855 for fieldName, assignedDimensions in assigned.items(): 

856 if len(assignedDimensions) > 1: 

857 # Pick the most popular (preferring mandatory dimensions) 

858 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

859 if requiredButMissing: 

860 candidateDimensions = requiredButMissing 

861 else: 

862 candidateDimensions = assignedDimensions 

863 

864 # If this is a choice between visit and exposure and 

865 # neither was a required part of the dataset type, 

866 # (hence in this branch) always prefer exposure over 

867 # visit since exposures are always defined and visits 

868 # are defined from exposures. 

869 if candidateDimensions == {"exposure", "visit"}: 

870 candidateDimensions = {"exposure"} 

871 

872 # Select the relevant items and get a new restricted 

873 # counter. 

874 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

875 duplicatesCounter: Counter[str] = Counter() 

876 duplicatesCounter.update(theseCounts) 

877 

878 # Choose the most common. If they are equally common 

879 # we will pick the one that was found first. 

880 # (most_common returns a list of (key, count) tuples). 

881 selected = duplicatesCounter.most_common(1)[0][0] 

882 

883 log.debug( 

884 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

885 " Removed ambiguity by choosing dimension %s.", 

886 fieldName, 

887 ", ".join(assignedDimensions), 

888 selected, 

889 ) 

890 

891 for candidateDimension in assignedDimensions: 

892 if candidateDimension != selected: 

893 del guessedAssociation[candidateDimension][fieldName] 

894 

895 # Update the record look up dict with the new associations 

896 for dimensionName, values in guessedAssociation.items(): 

897 if values: # A dict might now be empty 

898 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

899 byRecord[dimensionName].update(values) 

900 

901 if byRecord: 

902 # Some record specifiers were found so we need to convert 

903 # them to the Id form 

904 for dimensionName, values in byRecord.items(): 

905 if dimensionName in newDataId: 

906 log.debug( 

907 "DataId specified explicit %s dimension value of %s in addition to" 

908 " general record specifiers for it of %s. Ignoring record information.", 

909 dimensionName, 

910 newDataId[dimensionName], 

911 str(values), 

912 ) 

913 # Get the actual record and compare with these values. 

914 try: 

915 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

916 except DataIdError: 

917 raise ValueError( 

918 f"Could not find dimension '{dimensionName}'" 

919 f" with dataId {newDataId} as part of comparing with" 

920 f" record values {byRecord[dimensionName]}" 

921 ) from None 

922 if len(recs) == 1: 

923 errmsg: List[str] = [] 

924 for k, v in values.items(): 

925 if (recval := getattr(recs[0], k)) != v: 

926 errmsg.append(f"{k}({recval} != {v})") 

927 if errmsg: 

928 raise ValueError( 

929 f"Dimension {dimensionName} in dataId has explicit value" 

930 " inconsistent with records: " + ", ".join(errmsg) 

931 ) 

932 else: 

933 # Multiple matches for an explicit dimension 

934 # should never happen but let downstream complain. 

935 pass 

936 continue 

937 

938 # Build up a WHERE expression 

939 bind = {k: v for k, v in values.items()} 

940 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

941 

942 # Hopefully we get a single record that matches 

943 records = set( 

944 self.registry.queryDimensionRecords( 

945 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

946 ) 

947 ) 

948 

949 if len(records) != 1: 

950 if len(records) > 1: 

951 # visit can have an ambiguous answer without involving 

952 # visit_system. The default visit_system is defined 

953 # by the instrument. 

954 if ( 

955 dimensionName == "visit" 

956 and "visit_system_membership" in self.registry.dimensions 

957 and "visit_system" 

958 in self.registry.dimensions["instrument"].metadata # type: ignore 

959 ): 

960 instrument_records = list( 

961 self.registry.queryDimensionRecords( 

962 "instrument", 

963 dataId=newDataId, 

964 **kwargs, 

965 ) 

966 ) 

967 if len(instrument_records) == 1: 

968 visit_system = instrument_records[0].visit_system 

969 if visit_system is None: 

970 # Set to a value that will never match. 

971 visit_system = -1 

972 

973 # Look up each visit in the 

974 # visit_system_membership records. 

975 for rec in records: 

976 membership = list( 

977 self.registry.queryDimensionRecords( 

978 # Use bind to allow zero results. 

979 # This is a fully-specified query. 

980 "visit_system_membership", 

981 where="instrument = inst AND visit_system = system AND visit = v", 

982 bind=dict( 

983 inst=instrument_records[0].name, system=visit_system, v=rec.id 

984 ), 

985 ) 

986 ) 

987 if membership: 

988 # This record is the right answer. 

989 records = set([rec]) 

990 break 

991 

992 # The ambiguity may have been resolved so check again. 

993 if len(records) > 1: 

994 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

995 for r in records: 

996 log.debug("- %s", str(r)) 

997 raise ValueError( 

998 f"DataId specification for dimension {dimensionName} is not" 

999 f" uniquely constrained to a single dataset by {values}." 

1000 f" Got {len(records)} results." 

1001 ) 

1002 else: 

1003 raise ValueError( 

1004 f"DataId specification for dimension {dimensionName} matched no" 

1005 f" records when constrained by {values}" 

1006 ) 

1007 

1008 # Get the primary key from the real dimension object 

1009 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1010 if not isinstance(dimension, Dimension): 

1011 raise RuntimeError( 

1012 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1013 ) 

1014 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1015 

1016 return newDataId, kwargs 

1017 

1018 def _findDatasetRef( 

1019 self, 

1020 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1021 dataId: Optional[DataId] = None, 

1022 *, 

1023 collections: Any = None, 

1024 allowUnresolved: bool = False, 

1025 **kwargs: Any, 

1026 ) -> DatasetRef: 

1027 """Shared logic for methods that start with a search for a dataset in 

1028 the registry. 

1029 

1030 Parameters 

1031 ---------- 

1032 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1033 When `DatasetRef` the `dataId` should be `None`. 

1034 Otherwise the `DatasetType` or name thereof. 

1035 dataId : `dict` or `DataCoordinate`, optional 

1036 A `dict` of `Dimension` link name, value pairs that label the 

1037 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1038 should be provided as the first argument. 

1039 collections : Any, optional 

1040 Collections to be searched, overriding ``self.collections``. 

1041 Can be any of the types supported by the ``collections`` argument 

1042 to butler construction. 

1043 allowUnresolved : `bool`, optional 

1044 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1045 one in the `Registry` fails. Defaults to `False`. 

1046 **kwargs 

1047 Additional keyword arguments used to augment or construct a 

1048 `DataId`. See `DataId` parameters. 

1049 

1050 Returns 

1051 ------- 

1052 ref : `DatasetRef` 

1053 A reference to the dataset identified by the given arguments. 

1054 

1055 Raises 

1056 ------ 

1057 LookupError 

1058 Raised if no matching dataset exists in the `Registry` (and 

1059 ``allowUnresolved is False``). 

1060 ValueError 

1061 Raised if a resolved `DatasetRef` was passed as an input, but it 

1062 differs from the one found in the registry. 

1063 TypeError 

1064 Raised if no collections were provided. 

1065 """ 

1066 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1067 if isinstance(datasetRefOrType, DatasetRef): 

1068 idNumber = datasetRefOrType.id 

1069 else: 

1070 idNumber = None 

1071 timespan: Optional[Timespan] = None 

1072 

1073 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1074 

1075 if datasetType.isCalibration(): 

1076 # Because this is a calibration dataset, first try to 

1077 # standardize the data ID without restricting the dimensions to 

1078 # those of the dataset type requested, because there may be extra 

1079 # dimensions that provide temporal information for a validity-range 

1080 # lookup. 

1081 dataId = DataCoordinate.standardize( 

1082 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1083 ) 

1084 if dataId.graph.temporal: 

1085 dataId = self.registry.expandDataId(dataId) 

1086 timespan = dataId.timespan 

1087 else: 

1088 # Standardize the data ID to just the dimensions of the dataset 

1089 # type instead of letting registry.findDataset do it, so we get the 

1090 # result even if no dataset is found. 

1091 dataId = DataCoordinate.standardize( 

1092 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1093 ) 

1094 # Always look up the DatasetRef, even if one is given, to ensure it is 

1095 # present in the current collection. 

1096 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1097 if ref is None: 

1098 if allowUnresolved: 

1099 return DatasetRef(datasetType, dataId) 

1100 else: 

1101 if collections is None: 

1102 collections = self.registry.defaults.collections 

1103 raise LookupError( 

1104 f"Dataset {datasetType.name} with data ID {dataId} " 

1105 f"could not be found in collections {collections}." 

1106 ) 

1107 if idNumber is not None and idNumber != ref.id: 

1108 if collections is None: 

1109 collections = self.registry.defaults.collections 

1110 raise ValueError( 

1111 f"DatasetRef.id provided ({idNumber}) does not match " 

1112 f"id ({ref.id}) in registry in collections {collections}." 

1113 ) 

1114 if datasetType != ref.datasetType: 

1115 # If they differ it is because the user explicitly specified 

1116 # a compatible dataset type to this call rather than using the 

1117 # registry definition. The DatasetRef must therefore be recreated 

1118 # using the user definition such that the expected type is 

1119 # returned. 

1120 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1121 

1122 return ref 

1123 

1124 @transactional 

1125 def putDirect(self, obj: Any, ref: DatasetRef) -> DatasetRef: 

1126 # Docstring inherited. 

1127 (imported_ref,) = self.registry._importDatasets( 

1128 [ref], 

1129 expand=True, 

1130 ) 

1131 if imported_ref.id != ref.getCheckedId(): 

1132 raise RuntimeError("This registry configuration does not support putDirect.") 

1133 self.datastore.put(obj, ref) 

1134 return ref 

1135 

1136 @transactional 

1137 def put( 

1138 self, 

1139 obj: Any, 

1140 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1141 dataId: Optional[DataId] = None, 

1142 *, 

1143 run: Optional[str] = None, 

1144 **kwargs: Any, 

1145 ) -> DatasetRef: 

1146 """Store and register a dataset. 

1147 

1148 Parameters 

1149 ---------- 

1150 obj : `object` 

1151 The dataset. 

1152 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1153 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1154 Otherwise the `DatasetType` or name thereof. 

1155 dataId : `dict` or `DataCoordinate` 

1156 A `dict` of `Dimension` link name, value pairs that label the 

1157 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1158 should be provided as the second argument. 

1159 run : `str`, optional 

1160 The name of the run the dataset should be added to, overriding 

1161 ``self.run``. 

1162 **kwargs 

1163 Additional keyword arguments used to augment or construct a 

1164 `DataCoordinate`. See `DataCoordinate.standardize` 

1165 parameters. 

1166 

1167 Returns 

1168 ------- 

1169 ref : `DatasetRef` 

1170 A reference to the stored dataset, updated with the correct id if 

1171 given. 

1172 

1173 Raises 

1174 ------ 

1175 TypeError 

1176 Raised if the butler is read-only or if no run has been provided. 
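
Examples
--------
A minimal sketch (the run name, dataset type, and data ID values are
placeholders; the dataset type must already be registered)::

    butler = Butler("/path/to/repo", run="u/alice/demo")
    ref = butler.put(catalog, "src", instrument="HSC", visit=12345,
                     detector=42)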

1177 """ 

1178 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1179 if not self.isWriteable(): 

1180 raise TypeError("Butler is read-only.") 

1181 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1182 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1183 raise ValueError("DatasetRef must not be in registry, must have None id") 

1184 

1185 # Handle dimension records in dataId 

1186 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1187 

1188 # Add Registry Dataset entry. 

1189 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1190 

1191 # For an execution butler the datasets will be pre-defined. 

1192 # If the butler is configured that way, datasets should only be inserted 

1193 # if they do not already exist in registry. Trying and catching 

1194 # ConflictingDefinitionError will not work because the transaction 

1195 # will be corrupted. Instead, in this mode always check first. 

1196 ref = None 

1197 ref_is_predefined = False 

1198 if self._allow_put_of_predefined_dataset: 

1199 # Get the matching ref for this run. 

1200 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1201 

1202 if ref: 

1203 # Must be expanded form for datastore templating 

1204 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1205 ref = ref.expanded(dataId) 

1206 ref_is_predefined = True 

1207 

1208 if not ref: 

1209 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1210 

1211 # If the ref is predefined it is possible that the datastore also 

1212 # has the record. Asking datastore to put it again will result in 

1213 # the artifact being recreated, overwriting the previous one; writing 

1214 # the record would then fail and the new artifact would be removed. 

1215 # It is much safer to ask first before attempting to 

1216 # overwrite. Race conditions should not be an issue for the 

1217 # execution butler environment. 

1218 if ref_is_predefined: 

1219 if self.datastore.knows(ref): 

1220 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

1221 

1222 self.datastore.put(obj, ref) 

1223 

1224 return ref 

1225 

1226 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1227 """Retrieve a stored dataset. 

1228 

1229 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1230 collection to be read as long as the `DatasetRef` that identifies them 

1231 can be obtained separately. 

1232 

1233 Parameters 

1234 ---------- 

1235 ref : `DatasetRef` 

1236 Resolved reference to an already stored dataset. 

1237 parameters : `dict` 

1238 Additional StorageClass-defined options to control reading, 

1239 typically used to efficiently read only a subset of the dataset. 

1240 

1241 Returns 

1242 ------- 

1243 obj : `object` 

1244 The dataset. 
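
Examples
--------
A sketch that resolves a reference through the registry and then reads
it directly (the dataset type, data ID, and collection are
placeholders)::

    ref = butler.registry.findDataset("calexp", dataId,
                                      collections="u/alice/demo")
    if ref is not None:
        calexp = butler.getDirect(ref)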

1245 """ 

1246 return self.datastore.get(ref, parameters=parameters) 

1247 

1248 def getDirectDeferred( 

1249 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

1250 ) -> DeferredDatasetHandle: 

1251 """Create a `DeferredDatasetHandle` which can later retrieve a dataset 

1252 from a resolved `DatasetRef`. 

1253 

1254 Parameters 

1255 ---------- 

1256 ref : `DatasetRef` 

1257 Resolved reference to an already stored dataset. 

1258 parameters : `dict` 

1259 Additional StorageClass-defined options to control reading, 

1260 typically used to efficiently read only a subset of the dataset. 

1261 

1262 Returns 

1263 ------- 

1264 obj : `DeferredDatasetHandle` 

1265 A handle which can be used to retrieve a dataset at a later time. 

1266 

1267 Raises 

1268 ------ 

1269 AmbiguousDatasetError 

1270 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1271 """ 

1272 if ref.id is None: 

1273 raise AmbiguousDatasetError( 

1274 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1275 ) 

1276 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1277 

1278 def getDeferred( 

1279 self, 

1280 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1281 dataId: Optional[DataId] = None, 

1282 *, 

1283 parameters: Union[dict, None] = None, 

1284 collections: Any = None, 

1285 **kwargs: Any, 

1286 ) -> DeferredDatasetHandle: 

1287 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1288 after an immediate registry lookup. 

1289 

1290 Parameters 

1291 ---------- 

1292 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1293 When `DatasetRef` the `dataId` should be `None`. 

1294 Otherwise the `DatasetType` or name thereof. 

1295 dataId : `dict` or `DataCoordinate`, optional 

1296 A `dict` of `Dimension` link name, value pairs that label the 

1297 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1298 should be provided as the first argument. 

1299 parameters : `dict` 

1300 Additional StorageClass-defined options to control reading, 

1301 typically used to efficiently read only a subset of the dataset. 

1302 collections : Any, optional 

1303 Collections to be searched, overriding ``self.collections``. 

1304 Can be any of the types supported by the ``collections`` argument 

1305 to butler construction. 

1306 **kwargs 

1307 Additional keyword arguments used to augment or construct a 

1308 `DataId`. See `DataId` parameters. 

1309 

1310 Returns 

1311 ------- 

1312 obj : `DeferredDatasetHandle` 

1313 A handle which can be used to retrieve a dataset at a later time. 

1314 

1315 Raises 

1316 ------ 

1317 LookupError 

1318 Raised if no matching dataset exists in the `Registry`. 

1320 ValueError 

1321 Raised if a resolved `DatasetRef` was passed as an input, but it 

1322 differs from the one found in the registry. 

1323 TypeError 

1324 Raised if no collections were provided. 
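
Examples
--------
A sketch that defers the actual read (names and values are
placeholders; the butler is assumed to have default collections)::

    handle = butler.getDeferred("calexp", instrument="HSC", visit=12345,
                                detector=42)
    # The registry lookup has already happened; the I/O only happens
    # when the handle is used.
    calexp = handle.get()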

1325 """ 

1326 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1327 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1328 

1329 def get( 

1330 self, 

1331 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1332 dataId: Optional[DataId] = None, 

1333 *, 

1334 parameters: Optional[Dict[str, Any]] = None, 

1335 collections: Any = None, 

1336 **kwargs: Any, 

1337 ) -> Any: 

1338 """Retrieve a stored dataset. 

1339 

1340 Parameters 

1341 ---------- 

1342 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1343 When `DatasetRef` the `dataId` should be `None`. 

1344 Otherwise the `DatasetType` or name thereof. 

1345 dataId : `dict` or `DataCoordinate` 

1346 A `dict` of `Dimension` link name, value pairs that label the 

1347 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1348 should be provided as the first argument. 

1349 parameters : `dict` 

1350 Additional StorageClass-defined options to control reading, 

1351 typically used to efficiently read only a subset of the dataset. 

1352 collections : Any, optional 

1353 Collections to be searched, overriding ``self.collections``. 

1354 Can be any of the types supported by the ``collections`` argument 

1355 to butler construction. 

1356 **kwargs 

1357 Additional keyword arguments used to augment or construct a 

1358 `DataCoordinate`. See `DataCoordinate.standardize` 

1359 parameters. 

1360 

1361 Returns 

1362 ------- 

1363 obj : `object` 

1364 The dataset. 

1365 

1366 Raises 

1367 ------ 

1368 ValueError 

1369 Raised if a resolved `DatasetRef` was passed as an input, but it 

1370 differs from the one found in the registry. 

1371 LookupError 

1372 Raised if no matching dataset exists in the `Registry`. 

1373 TypeError 

1374 Raised if no collections were provided. 

1375 

1376 Notes 

1377 ----- 

1378 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1379 this method requires that the given data ID include temporal dimensions 

1380 beyond the dimensions of the dataset type itself, in order to find the 

1381 dataset with the appropriate validity range. For example, a "bias" 

1382 dataset with native dimensions ``{instrument, detector}`` could be 

1383 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1384 ``exposure`` is a temporal dimension. 
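
Examples
--------
A sketch of such a calibration lookup; the extra ``exposure`` value
supplies the temporal information used to select the validity range
(all names and values are placeholders)::

    bias = butler.get("bias", instrument="HSC", detector=42,
                      exposure=12345, collections="HSC/calib")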

1385 """ 

1386 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1387 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1388 return self.getDirect(ref, parameters=parameters) 

1389 

1390 def getURIs( 

1391 self, 

1392 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1393 dataId: Optional[DataId] = None, 

1394 *, 

1395 predict: bool = False, 

1396 collections: Any = None, 

1397 run: Optional[str] = None, 

1398 **kwargs: Any, 

1399 ) -> DatasetRefURIs: 

1400 """Return the URIs associated with the dataset. 

1401 

1402 Parameters 

1403 ---------- 

1404 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1405 When `DatasetRef` the `dataId` should be `None`. 

1406 Otherwise the `DatasetType` or name thereof. 

1407 dataId : `dict` or `DataCoordinate` 

1408 A `dict` of `Dimension` link name, value pairs that label the 

1409 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1410 should be provided as the first argument. 

1411 predict : `bool` 

1412 If `True`, allow URIs to be returned for datasets that have not 

1413 been written. 

1414 collections : Any, optional 

1415 Collections to be searched, overriding ``self.collections``. 

1416 Can be any of the types supported by the ``collections`` argument 

1417 to butler construction. 

1418 run : `str`, optional 

1419 Run to use for predictions, overriding ``self.run``. 

1420 **kwargs 

1421 Additional keyword arguments used to augment or construct a 

1422 `DataCoordinate`. See `DataCoordinate.standardize` 

1423 parameters. 

1424 

1425 Returns 

1426 ------- 

1427 uris : `DatasetRefURIs` 

1428 The URI to the primary artifact associated with this dataset (if 

1429 the dataset was disassembled within the datastore this may be 

1430 `None`), and the URIs to any components associated with the dataset 

1431 artifact (which can be empty if there are no components). 

1432 """ 

1433 ref = self._findDatasetRef( 

1434 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1435 ) 

1436 if ref.id is None: # only possible if predict is True 

1437 if run is None: 

1438 run = self.run 

1439 if run is None: 

1440 raise TypeError("Cannot predict location with run=None.") 

1441 # Lie about ID, because we can't guess it, and only 

1442 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1443 ref = ref.resolved(id=0, run=run) 

1444 return self.datastore.getURIs(ref, predict) 

1445 
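# Illustrative usage sketch (hypothetical dataset type, run, and data ID
# values): getURIs() returns a DatasetRefURIs that unpacks into the primary
# URI and a mapping of component URIs; with predict=True it can also report
# where an unwritten dataset would land.
#
#     primary, components = butler.getURIs("deepCoadd", tract=0, patch=42,
#                                          band="r", skymap="hsc_rings_v1")
#     predicted, _ = butler.getURIs("deepCoadd", tract=0, patch=43, band="r",
#                                   skymap="hsc_rings_v1", predict=True,
#                                   run="u/user/predicted")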

1446 def getURI( 

1447 self, 

1448 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1449 dataId: Optional[DataId] = None, 

1450 *, 

1451 predict: bool = False, 

1452 collections: Any = None, 

1453 run: Optional[str] = None, 

1454 **kwargs: Any, 

1455 ) -> ResourcePath: 

1456 """Return the URI to the Dataset. 

1457 

1458 Parameters 

1459 ---------- 

1460 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1461 When `DatasetRef` the `dataId` should be `None`. 

1462 Otherwise the `DatasetType` or name thereof. 

1463 dataId : `dict` or `DataCoordinate` 

1464 A `dict` of `Dimension` link name, value pairs that label the 

1465 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1466 should be provided as the first argument. 

1467 predict : `bool` 

1468 If `True`, allow URIs to be returned for datasets that have not 

1469 yet been written. 

1470 collections : Any, optional 

1471 Collections to be searched, overriding ``self.collections``. 

1472 Can be any of the types supported by the ``collections`` argument 

1473 to butler construction. 

1474 run : `str`, optional 

1475 Run to use for predictions, overriding ``self.run``. 

1476 **kwargs 

1477 Additional keyword arguments used to augment or construct a 

1478 `DataCoordinate`. See `DataCoordinate.standardize` 

1479 parameters. 

1480 

1481 Returns 

1482 ------- 

1483 uri : `lsst.resources.ResourcePath` 

1484 URI pointing to the Dataset within the datastore. If the 

1485 Dataset does not exist in the datastore, and if ``predict`` is 

1486 `True`, the URI will be a prediction and will include a URI 

1487 fragment "#predicted". 

1488 If the datastore does not have entities that relate well 

1489 to the concept of a URI, the returned URI string will be 

1490 descriptive. The returned URI is not guaranteed to be obtainable. 

1491 

1492 Raises 

1493 ------ 

1494 LookupError 

1495 Raised if a URI has been requested for a dataset that does not 

1496 exist and guessing is not allowed. 

1497 ValueError 

1498 Raised if a resolved `DatasetRef` was passed as an input, but it 

1499 differs from the one found in the registry. 

1500 TypeError 

1501 Raised if no collections were provided. 

1502 RuntimeError 

1503 Raised if a URI is requested for a dataset that consists of 

1504 multiple artifacts. 

1505 """ 

1506 primary, components = self.getURIs( 

1507 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1508 ) 

1509 

1510 if primary is None or components: 

1511 raise RuntimeError( 

1512 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1513 "Use Butler.getURIs() instead." 

1514 ) 

1515 return primary 

1516 
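# Illustrative usage sketch (hypothetical dataset type and data ID): getURI()
# only works for single-artifact datasets; if the datastore disassembled the
# dataset into components it raises RuntimeError and getURIs() must be used.
#
#     try:
#         uri = butler.getURI("calexp", instrument="HSC", detector=0,
#                             visit=903334)
#     except RuntimeError:
#         primary, components = butler.getURIs("calexp", instrument="HSC",
#                                              detector=0, visit=903334)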

1517 def retrieveArtifacts( 

1518 self, 

1519 refs: Iterable[DatasetRef], 

1520 destination: ResourcePathExpression, 

1521 transfer: str = "auto", 

1522 preserve_path: bool = True, 

1523 overwrite: bool = False, 

1524 ) -> List[ResourcePath]: 

1525 """Retrieve the artifacts associated with the supplied refs. 

1526 

1527 Parameters 

1528 ---------- 

1529 refs : iterable of `DatasetRef` 

1530 The datasets for which artifacts are to be retrieved. 

1531 A single ref can result in multiple artifacts. The refs must 

1532 be resolved. 

1533 destination : `lsst.resources.ResourcePath` or `str` 

1534 Location to write the artifacts. 

1535 transfer : `str`, optional 

1536 Method to use to transfer the artifacts. Must be one of the options 

1537 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1538 "move" is not allowed. 

1539 preserve_path : `bool`, optional 

1540 If `True` the full path of the artifact within the datastore 

1541 is preserved. If `False` the final file component of the path 

1542 is used. 

1543 overwrite : `bool`, optional 

1544 If `True` allow transfers to overwrite existing files at the 

1545 destination. 

1546 

1547 Returns 

1548 ------- 

1549 targets : `list` of `lsst.resources.ResourcePath` 

1550 URIs of file artifacts in destination location. Order is not 

1551 preserved. 

1552 

1553 Notes 

1554 ----- 

1555 For non-file datastores the artifacts written to the destination 

1556 may not match the representation inside the datastore. For example, 

1557 a hierarchical data structure in a NoSQL database may well be stored 

1558 as a JSON file. 

1559 """ 

1560 return self.datastore.retrieveArtifacts( 

1561 refs, 

1562 ResourcePath(destination), 

1563 transfer=transfer, 

1564 preserve_path=preserve_path, 

1565 overwrite=overwrite, 

1566 ) 

1567 
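# Illustrative usage sketch (hypothetical collection and destination path):
# copying the file artifacts behind a registry query out of the datastore.
#
#     refs = butler.registry.queryDatasets("raw", collections="HSC/raw/all")
#     paths = butler.retrieveArtifacts(refs, destination="/tmp/export",
#                                      transfer="copy", preserve_path=True)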

1568 def datasetExists( 

1569 self, 

1570 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1571 dataId: Optional[DataId] = None, 

1572 *, 

1573 collections: Any = None, 

1574 **kwargs: Any, 

1575 ) -> bool: 

1576 """Return True if the Dataset is actually present in the Datastore. 

1577 

1578 Parameters 

1579 ---------- 

1580 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1581 When `DatasetRef` the `dataId` should be `None`. 

1582 Otherwise the `DatasetType` or name thereof. 

1583 dataId : `dict` or `DataCoordinate` 

1584 A `dict` of `Dimension` link name, value pairs that label the 

1585 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1586 should be provided as the first argument. 

1587 collections : Any, optional 

1588 Collections to be searched, overriding ``self.collections``. 

1589 Can be any of the types supported by the ``collections`` argument 

1590 to butler construction. 

1591 **kwargs 

1592 Additional keyword arguments used to augment or construct a 

1593 `DataCoordinate`. See `DataCoordinate.standardize` 

1594 parameters. 

1595 

1596 Raises 

1597 ------ 

1598 LookupError 

1599 Raised if the dataset is not even present in the Registry. 

1600 ValueError 

1601 Raised if a resolved `DatasetRef` was passed as an input, but it 

1602 differs from the one found in the registry. 

1603 TypeError 

1604 Raised if no collections were provided. 

1605 """ 

1606 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1607 return self.datastore.exists(ref) 

1608 
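# Illustrative usage sketch (hypothetical names): datasetExists() reports
# datastore presence but raises LookupError when the dataset is unknown to
# the registry, so callers that only want a boolean may wrap it.
#
#     try:
#         stored = butler.datasetExists("flat", instrument="HSC", detector=0,
#                                       physical_filter="HSC-I",
#                                       collections="HSC/calib")
#     except LookupError:
#         stored = False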

1609 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1610 """Remove one or more `~CollectionType.RUN` collections and the 

1611 datasets within them. 

1612 

1613 Parameters 

1614 ---------- 

1615 names : `Iterable` [ `str` ] 

1616 The names of the collections to remove. 

1617 unstore : `bool`, optional 

1618 If `True` (default), delete datasets from all datastores in which 

1619 they are present, and attempt to roll back the registry deletions if 

1620 datastore deletions fail (which may not always be possible). If 

1621 `False`, datastore records for these datasets are still removed, 

1622 but any artifacts (e.g. files) will not be. 

1623 

1624 Raises 

1625 ------ 

1626 TypeError 

1627 Raised if one or more collections are not of type 

1628 `~CollectionType.RUN`. 

1629 """ 

1630 if not self.isWriteable(): 

1631 raise TypeError("Butler is read-only.") 

1632 names = list(names) 

1633 refs: List[DatasetRef] = [] 

1634 for name in names: 

1635 collectionType = self.registry.getCollectionType(name) 

1636 if collectionType is not CollectionType.RUN: 

1637 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1638 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1639 with self.registry.transaction(): 

1640 if unstore: 

1641 self.datastore.trash(refs) 

1642 else: 

1643 self.datastore.forget(refs) 

1644 for name in names: 

1645 self.registry.removeCollection(name) 

1646 if unstore: 

1647 # Point of no return for removing artifacts 

1648 self.datastore.emptyTrash() 

1649 
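# Illustrative usage sketch (hypothetical run names; the butler is assumed to
# have been constructed writeable): deleting scratch RUN collections together
# with their file artifacts.
#
#     butler.removeRuns(["u/user/scratch_1", "u/user/scratch_2"],
#                       unstore=True)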

1650 def pruneCollection( 

1651 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1652 ) -> None: 

1653 """Remove a collection and possibly prune datasets within it. 

1654 

1655 Parameters 

1656 ---------- 

1657 name : `str` 

1658 Name of the collection to remove. If this is a 

1659 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1660 datasets within the collection are not modified unless ``unstore`` 

1661 is `True`. If this is a `~CollectionType.RUN` collection, 

1662 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1663 are fully removed from the data repository. 

1664 purge : `bool`, optional 

1665 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1666 fully removing datasets within them. Requires ``unstore=True`` as 

1667 well as an added precaution against accidental deletion. Must be 

1668 `False` (default) if the collection is not a ``RUN``. 

1669 unstore : `bool`, optional 

1670 If `True`, remove all datasets in the collection from all 

1671 datastores in which they appear. 

1672 unlink : `list` [`str`], optional 

1673 Before removing the given collection, unlink it from these 

1674 parent collections. 

1675 

1676 Raises 

1677 ------ 

1678 TypeError 

1679 Raised if the butler is read-only or arguments are mutually 

1680 inconsistent. 

1681 """ 

1682 # See pruneDatasets comments for more information about the logic here; 

1683 # the cases are almost the same, but here we can rely on Registry to 

1684 # take care of everything but Datastore deletion when we remove the 

1685 # collection. 

1686 if not self.isWriteable(): 

1687 raise TypeError("Butler is read-only.") 

1688 collectionType = self.registry.getCollectionType(name) 

1689 if purge and not unstore: 

1690 raise PurgeWithoutUnstorePruneCollectionsError() 

1691 if collectionType is CollectionType.RUN and not purge: 

1692 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1693 if collectionType is not CollectionType.RUN and purge: 

1694 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1695 

1696 def remove(child: str, parent: str) -> None: 

1697 """Remove a child collection from a parent collection.""" 

1698 # Remove child from parent. 

1699 chain = list(self.registry.getCollectionChain(parent)) 

1700 try: 

1701 chain.remove(child) 

1702 except ValueError as e: 

1703 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1704 self.registry.setCollectionChain(parent, chain) 

1705 

1706 with self.registry.transaction(): 

1707 if unlink: 

1708 for parent in unlink: 

1709 remove(name, parent) 

1710 if unstore: 

1711 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1712 self.datastore.trash(refs) 

1713 self.registry.removeCollection(name) 

1714 

1715 if unstore: 

1716 # Point of no return for removing artifacts 

1717 self.datastore.emptyTrash() 

1718 
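# Illustrative usage sketch (hypothetical collection names) of the argument
# combinations enforced above: a RUN collection needs both purge=True and
# unstore=True, while a TAGGED collection can simply be unlinked and removed.
#
#     butler.pruneCollection("u/user/old_run", purge=True, unstore=True)
#     butler.pruneCollection("u/user/my_tag", unlink=["u/user/my_chain"])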

1719 def pruneDatasets( 

1720 self, 

1721 refs: Iterable[DatasetRef], 

1722 *, 

1723 disassociate: bool = True, 

1724 unstore: bool = False, 

1725 tags: Iterable[str] = (), 

1726 purge: bool = False, 

1727 ) -> None: 

1728 # docstring inherited from LimitedButler 

1729 

1730 if not self.isWriteable(): 

1731 raise TypeError("Butler is read-only.") 

1732 if purge: 

1733 if not disassociate: 

1734 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1735 if not unstore: 

1736 raise TypeError("Cannot pass purge=True without unstore=True.") 

1737 elif disassociate: 

1738 tags = tuple(tags) 

1739 if not tags: 

1740 raise TypeError("No tags provided but disassociate=True.") 

1741 for tag in tags: 

1742 collectionType = self.registry.getCollectionType(tag) 

1743 if collectionType is not CollectionType.TAGGED: 

1744 raise TypeError( 

1745 f"Cannot disassociate from collection '{tag}' " 

1746 f"of non-TAGGED type {collectionType.name}." 

1747 ) 

1748 # For an execution butler we want to keep existing UUIDs for the 

1749 # datasets, for that we need to keep them in the collections but 

1750 # remove from datastore. 

1751 if self._allow_put_of_predefined_dataset and purge: 

1752 purge = False 

1753 disassociate = False 

1754 # Transform possibly-single-pass iterable into something we can iterate 

1755 # over multiple times. 

1756 refs = list(refs) 

1757 # Pruning a component of a DatasetRef makes no sense since registry 

1758 # doesn't know about components and datastore might not store 

1759 # components in a separate file 

1760 for ref in refs: 

1761 if ref.datasetType.component(): 

1762 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})") 

1763 # We don't need an unreliable Datastore transaction for this, because 

1764 # we've been extra careful to ensure that Datastore.trash only involves 

1765 # mutating the Registry (it can _look_ at Datastore-specific things, 

1766 # but shouldn't change them), and hence all operations here are 

1767 # Registry operations. 

1768 with self.registry.transaction(): 

1769 if unstore: 

1770 self.datastore.trash(refs) 

1771 if purge: 

1772 self.registry.removeDatasets(refs) 

1773 elif disassociate: 

1774 assert tags, "Guaranteed by earlier logic in this function." 

1775 for tag in tags: 

1776 self.registry.disassociate(tag, refs) 

1777 # We've exited the Registry transaction, and apparently committed. 

1778 # (if there was an exception, everything rolled back, and it's as if 

1779 # nothing happened - and we never get here). 

1780 # Datastore artifacts are not yet gone, but they're clearly marked 

1781 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1782 # problems we can try again later, and if manual administrative 

1783 # intervention is required, it's pretty clear what that should entail: 

1784 # deleting everything on disk and in private Datastore tables that is 

1785 # in the dataset_location_trash table. 

1786 if unstore: 

1787 # Point of no return for removing artifacts 

1788 self.datastore.emptyTrash() 

1789 
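# Illustrative usage sketch (hypothetical collections): the two common calls,
# either deleting the queried datasets outright or only removing them from a
# TAGGED collection.
#
#     refs = list(butler.registry.queryDatasets("calexp",
#                                               collections="u/user/run"))
#     # Either remove them from the repository entirely ...
#     butler.pruneDatasets(refs, purge=True, unstore=True)
#     # ... or, alternatively, only drop the TAGGED-collection association:
#     # butler.pruneDatasets(refs, disassociate=True, tags=["u/user/my_tag"])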

1790 @transactional 

1791 def ingest( 

1792 self, 

1793 *datasets: FileDataset, 

1794 transfer: Optional[str] = "auto", 

1795 run: Optional[str] = None, 

1796 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1797 record_validation_info: bool = True, 

1798 ) -> None: 

1799 """Store and register one or more datasets that already exist on disk. 

1800 

1801 Parameters 

1802 ---------- 

1803 datasets : `FileDataset` 

1804 Each positional argument is a struct containing information about 

1805 a file to be ingested, including its URI (either absolute or 

1806 relative to the datastore root, if applicable), a `DatasetRef`, 

1807 and optionally a formatter class or its fully-qualified string 

1808 name. If a formatter is not provided, the formatter that would be 

1809 used for `put` is assumed. On successful return, all 

1810 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1811 attribute populated and all `FileDataset.formatter` attributes will 

1812 be set to the formatter class used. `FileDataset.path` attributes 

1813 may be modified to put paths in whatever the datastore considers a 

1814 standardized form. 

1815 transfer : `str`, optional 

1816 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1817 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1818 transfer the file. 

1819 run : `str`, optional 

1820 The name of the run ingested datasets should be added to, 

1821 overriding ``self.run``. 

1822 idGenerationMode : `DatasetIdGenEnum`, optional 

1823 Specifies option for generating dataset IDs. By default unique IDs 

1824 are generated for each inserted dataset. 

1825 record_validation_info : `bool`, optional 

1826 If `True`, the default, the datastore can record validation 

1827 information associated with the file. If `False` the datastore 

1828 will not attempt to track any information such as checksums 

1829 or file sizes. This can be useful if such information is tracked 

1830 in an external system or if the file is to be compressed in place. 

1831 It is up to the datastore whether this parameter is relevant. 

1832 

1833 Raises 

1834 ------ 

1835 TypeError 

1836 Raised if the butler is read-only or if no run was provided. 

1837 NotImplementedError 

1838 Raised if the `Datastore` does not support the given transfer mode. 

1839 DatasetTypeNotSupportedError 

1840 Raised if one or more files to be ingested have a dataset type that 

1841 is not supported by the `Datastore`. 

1842 FileNotFoundError 

1843 Raised if one of the given files does not exist. 

1844 FileExistsError 

1845 Raised if transfer is not `None` but the (internal) location the 

1846 file would be moved to is already occupied. 

1847 

1848 Notes 

1849 ----- 

1850 This operation is not fully exception safe: if a database operation 

1851 fails, the given `FileDataset` instances may be only partially updated. 

1852 

1853 It is atomic in terms of database operations (they will either all 

1854 succeed or all fail) providing the database engine implements 

1855 transactions correctly. It will attempt to be atomic in terms of 

1856 filesystem operations as well, but this cannot be implemented 

1857 rigorously for most datastores. 

1858 """ 

1859 if not self.isWriteable(): 

1860 raise TypeError("Butler is read-only.") 

1861 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1862 # Reorganize the inputs so they're grouped by DatasetType and then 

1863 # data ID. We also include a list of DatasetRefs for each FileDataset 

1864 # to hold the resolved DatasetRefs returned by the Registry, before 

1865 # it's safe to swap them into FileDataset.refs. 

1866 # Some type annotation aliases to make that clearer: 

1867 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1868 GroupedData = MutableMapping[DatasetType, GroupForType] 

1869 # The actual data structure: 

1870 groupedData: GroupedData = defaultdict(dict) 

1871 # And the nested loop that populates it: 

1872 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1873 # This list is intentionally shared across the inner loop, since it's 

1874 # associated with `dataset`. 

1875 resolvedRefs: List[DatasetRef] = [] 

1876 

1877 # Somewhere to store pre-existing refs if we have an 

1878 # execution butler. 

1879 existingRefs: List[DatasetRef] = [] 

1880 

1881 for ref in dataset.refs: 

1882 if ref.dataId in groupedData[ref.datasetType]: 

1883 raise ConflictingDefinitionError( 

1884 f"Ingest conflict. Dataset {dataset.path} has same" 

1885 " DataId as other ingest dataset" 

1886 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1887 f" ({ref.dataId})" 

1888 ) 

1889 if self._allow_put_of_predefined_dataset: 

1890 existing_ref = self.registry.findDataset( 

1891 ref.datasetType, dataId=ref.dataId, collections=run 

1892 ) 

1893 if existing_ref: 

1894 if self.datastore.knows(existing_ref): 

1895 raise ConflictingDefinitionError( 

1896 f"Dataset associated with path {dataset.path}" 

1897 f" already exists as {existing_ref}." 

1898 ) 

1899 # Store this ref elsewhere since it already exists 

1900 # and we do not want to remake it but we do want 

1901 # to store it in the datastore. 

1902 existingRefs.append(existing_ref) 

1903 

1904 # Nothing else to do until we have finished 

1905 # iterating. 

1906 continue 

1907 

1908 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1909 

1910 if existingRefs: 

1911 

1912 if len(dataset.refs) != len(existingRefs): 

1913 # Keeping track of partially pre-existing datasets is hard 

1914 # and should generally never happen. For now don't allow 

1915 # it. 

1916 raise ConflictingDefinitionError( 

1917 f"For dataset {dataset.path} some dataIds already exist" 

1918 " in registry but others do not. This is not supported." 

1919 ) 

1920 

1921 # Attach the resolved refs if we found them. 

1922 dataset.refs = existingRefs 

1923 

1924 # Now we can bulk-insert into Registry for each DatasetType. 

1925 for datasetType, groupForType in progress.iter_item_chunks( 

1926 groupedData.items(), desc="Bulk-inserting datasets by type" 

1927 ): 

1928 refs = self.registry.insertDatasets( 

1929 datasetType, 

1930 dataIds=groupForType.keys(), 

1931 run=run, 

1932 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1933 idGenerationMode=idGenerationMode, 

1934 ) 

1935 # Append those resolved DatasetRefs to the new lists we set up for 

1936 # them. 

1937 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1938 resolvedRefs.append(ref) 

1939 

1940 # Go back to the original FileDatasets to replace their refs with the 

1941 # new resolved ones. 

1942 for groupForType in progress.iter_chunks( 

1943 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1944 ): 

1945 for dataset, resolvedRefs in groupForType.values(): 

1946 dataset.refs = resolvedRefs 

1947 

1948 # Bulk-insert everything into Datastore. 

1949 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1950 
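# Illustrative usage sketch (hypothetical file path, dataset type, data ID,
# and run): wrapping an existing on-disk file in a FileDataset and ingesting
# it by copy.
#
#     from lsst.daf.butler import DatasetRef, FileDataset
#     ref = DatasetRef(butler.registry.getDatasetType("raw"),
#                      {"instrument": "HSC", "detector": 0,
#                       "exposure": 903334})
#     butler.ingest(FileDataset(path="/data/HSC-903334-000.fits", refs=[ref]),
#                   transfer="copy", run="HSC/raw/all")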

1951 @contextlib.contextmanager 

1952 def export( 

1953 self, 

1954 *, 

1955 directory: Optional[str] = None, 

1956 filename: Optional[str] = None, 

1957 format: Optional[str] = None, 

1958 transfer: Optional[str] = None, 

1959 ) -> Iterator[RepoExportContext]: 

1960 """Export datasets from the repository represented by this `Butler`. 

1961 

1962 This method is a context manager that returns a helper object 

1963 (`RepoExportContext`) that is used to indicate what information from 

1964 the repository should be exported. 

1965 

1966 Parameters 

1967 ---------- 

1968 directory : `str`, optional 

1969 Directory dataset files should be written to if ``transfer`` is not 

1970 `None`. 

1971 filename : `str`, optional 

1972 Name for the file that will include database information associated 

1973 with the exported datasets. If this is not an absolute path and 

1974 ``directory`` is not `None`, it will be written to ``directory`` 

1975 instead of the current working directory. Defaults to 

1976 "export.{format}". 

1977 format : `str`, optional 

1978 File format for the database information file. If `None`, the 

1979 extension of ``filename`` will be used. 

1980 transfer : `str`, optional 

1981 Transfer mode passed to `Datastore.export`. 

1982 

1983 Raises 

1984 ------ 

1985 TypeError 

1986 Raised if the set of arguments passed is inconsistent. 

1987 

1988 Examples 

1989 -------- 

1990 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1991 methods are used to provide the iterables over data IDs and/or datasets 

1992 to be exported:: 

1993 

1994 with butler.export(filename="exports.yaml") as export: 

1995 # Export all flats, but none of the dimension element rows 

1996 # (i.e. data ID information) associated with them. 

1997 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1998 elements=()) 

1999 # Export all datasets that start with "deepCoadd_" and all of 

2000 # their associated data ID information. 

2001 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2002 """ 

2003 if directory is None and transfer is not None: 

2004 raise TypeError("Cannot transfer without providing a directory.") 

2005 if transfer == "move": 

2006 raise TypeError("Transfer may not be 'move': export is read-only") 

2007 if format is None: 

2008 if filename is None: 

2009 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2010 else: 

2011 _, format = os.path.splitext(filename) 

2012 elif filename is None: 

2013 filename = f"export.{format}" 

2014 if directory is not None: 

2015 filename = os.path.join(directory, filename) 

2016 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

2017 with open(filename, "w") as stream: 

2018 backend = BackendClass(stream, universe=self.registry.dimensions) 

2019 try: 

2020 helper = RepoExportContext( 

2021 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2022 ) 

2023 yield helper 

2024 except BaseException: 

2025 raise 

2026 else: 

2027 helper._finish() 

2028 

2029 def import_( 

2030 self, 

2031 *, 

2032 directory: Optional[str] = None, 

2033 filename: Union[str, TextIO, None] = None, 

2034 format: Optional[str] = None, 

2035 transfer: Optional[str] = None, 

2036 skip_dimensions: Optional[Set] = None, 

2037 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2038 reuseIds: bool = False, 

2039 ) -> None: 

2040 """Import datasets into this repository that were exported from a 

2041 different butler repository via `~lsst.daf.butler.Butler.export`. 

2042 

2043 Parameters 

2044 ---------- 

2045 directory : `str`, optional 

2046 Directory containing dataset files to import from. If `None`, 

2047 ``filename`` and all dataset file paths specified therein must 

2048 be absolute. 

2049 filename : `str` or `TextIO`, optional 

2050 A stream or name of file that contains database information 

2051 associated with the exported datasets, typically generated by 

2052 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

2053 is not an absolute path, does not exist in the current working 

2054 directory, and ``directory`` is not `None`, it is assumed to be in 

2055 ``directory``. Defaults to "export.{format}". 

2056 format : `str`, optional 

2057 File format for ``filename``. If `None`, the extension of 

2058 ``filename`` will be used. 

2059 transfer : `str`, optional 

2060 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2061 skip_dimensions : `set`, optional 

2062 Names of dimensions that should be skipped and not imported. 

2063 idGenerationMode : `DatasetIdGenEnum`, optional 

2064 Specifies option for generating dataset IDs when IDs are not 

2065 provided or their type does not match backend type. By default 

2066 unique IDs are generated for each inserted dataset. 

2067 reuseIds : `bool`, optional 

2068 If `True` then forces re-use of imported dataset IDs for integer 

2069 IDs which are normally generated as auto-incremented; an exception 

2070 will be raised if imported IDs clash with existing ones. This 

2071 option has no effect on the use of globally-unique IDs which are 

2072 always re-used (or generated if integer IDs are being imported). 

2073 

2074 Raises 

2075 ------ 

2076 TypeError 

2077 Raised if the set of arguments passed is inconsistent, or if the 

2078 butler is read-only. 

2079 """ 

2080 if not self.isWriteable(): 

2081 raise TypeError("Butler is read-only.") 

2082 if format is None: 

2083 if filename is None: 

2084 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2085 else: 

2086 _, format = os.path.splitext(filename) # type: ignore 

2087 elif filename is None: 

2088 filename = f"export.{format}" 

2089 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2090 filename = os.path.join(directory, filename) 

2091 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2092 

2093 def doImport(importStream: TextIO) -> None: 

2094 backend = BackendClass(importStream, self.registry) 

2095 backend.register() 

2096 with self.transaction(): 

2097 backend.load( 

2098 self.datastore, 

2099 directory=directory, 

2100 transfer=transfer, 

2101 skip_dimensions=skip_dimensions, 

2102 idGenerationMode=idGenerationMode, 

2103 reuseIds=reuseIds, 

2104 ) 

2105 

2106 if isinstance(filename, str): 

2107 with open(filename, "r") as stream: 

2108 doImport(stream) 

2109 else: 

2110 doImport(filename) 

2111 
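# Illustrative round-trip sketch (hypothetical paths and collection) pairing
# import_() with export() above: one repository writes a YAML description
# plus copies of the artifacts, and another repository loads them back.
#
#     with src_butler.export(directory="/tmp/dump", filename="export.yaml",
#                            transfer="copy") as export:
#         export.saveDatasets(src_butler.registry.queryDatasets(
#             "flat", collections="HSC/calib"))
#     dst_butler.import_(directory="/tmp/dump", filename="export.yaml",
#                        transfer="symlink")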

2112 def transfer_from( 

2113 self, 

2114 source_butler: Butler, 

2115 source_refs: Iterable[DatasetRef], 

2116 transfer: str = "auto", 

2117 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

2118 skip_missing: bool = True, 

2119 register_dataset_types: bool = False, 

2120 transfer_dimensions: bool = False, 

2121 ) -> List[DatasetRef]: 

2122 """Transfer datasets to this Butler from a run in another Butler. 

2123 

2124 Parameters 

2125 ---------- 

2126 source_butler : `Butler` 

2127 Butler from which the datasets are to be transferred. 

2128 source_refs : iterable of `DatasetRef` 

2129 Datasets defined in the source butler that should be transferred to 

2130 this butler. 

2131 transfer : `str`, optional 

2132 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2133 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2134 A mapping of dataset type to ID generation mode. Only used if 

2135 the source butler is using integer IDs. Should not be used 

2136 if this receiving butler uses integer IDs. Without this mapping, 

2137 dataset import always uses `DatasetIdGenEnum.UNIQUE`. 

2138 skip_missing : `bool` 

2139 If `True`, datasets with no datastore artifact associated with 

2140 them are not transferred. If `False` a registry entry will be 

2141 created even if no datastore record is created (and so will 

2142 look equivalent to the dataset being unstored). 

2143 register_dataset_types : `bool` 

2144 If `True` any missing dataset types are registered. Otherwise 

2145 an exception is raised. 

2146 transfer_dimensions : `bool`, optional 

2147 If `True`, dimension record data associated with the new datasets 

2148 will be transferred. 

2149 

2150 Returns 

2151 ------- 

2152 refs : `list` of `DatasetRef` 

2153 The refs added to this Butler. 

2154 

2155 Notes 

2156 ----- 

2157 Requires that any dimension definitions are already present in the 

2158 receiving Butler. The datastore artifact has to exist for a transfer 

2159 to be made but non-existence is not an error. 

2160 

2161 Datasets that already exist in this run will be skipped. 

2162 

2163 The datasets are imported as part of a transaction, although 

2164 dataset types are registered before the transaction is started. 

2165 This means that it is possible for a dataset type to be registered 

2166 even though transfer has failed. 

2167 """ 

2168 if not self.isWriteable(): 

2169 raise TypeError("Butler is read-only.") 

2170 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2171 

2172 # Will iterate through the refs multiple times so need to convert 

2173 # to a list if this isn't a collection. 

2174 if not isinstance(source_refs, collections.abc.Collection): 

2175 source_refs = list(source_refs) 

2176 

2177 original_count = len(source_refs) 

2178 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2179 

2180 if id_gen_map is None: 

2181 id_gen_map = {} 

2182 

2183 # In some situations the datastore artifact may be missing 

2184 # and we do not want that registry entry to be imported. 

2185 # Asking the datastore is not sufficient: the records may have been 

2186 # purged, so we have to ask for the (predicted) URI and check 

2187 # existence explicitly. Execution butler is set up exactly like 

2188 # this with no datastore records. 

2189 artifact_existence: Dict[ResourcePath, bool] = {} 

2190 if skip_missing: 

2191 dataset_existence = source_butler.datastore.mexists( 

2192 source_refs, artifact_existence=artifact_existence 

2193 ) 

2194 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2195 filtered_count = len(source_refs) 

2196 log.verbose( 

2197 "%d datasets removed because the artifact does not exist. Now have %d.", 

2198 original_count - filtered_count, 

2199 filtered_count, 

2200 ) 

2201 

2202 # Importing requires that we group the refs by dataset type and run 

2203 # before doing the import. 

2204 source_dataset_types = set() 

2205 grouped_refs = defaultdict(list) 

2206 grouped_indices = defaultdict(list) 

2207 for i, ref in enumerate(source_refs): 

2208 grouped_refs[ref.datasetType, ref.run].append(ref) 

2209 grouped_indices[ref.datasetType, ref.run].append(i) 

2210 source_dataset_types.add(ref.datasetType) 

2211 

2212 # Check to see if the dataset type in the source butler has 

2213 # the same definition in the target butler and register missing 

2214 # ones if requested. Registration must happen outside a transaction. 

2215 newly_registered_dataset_types = set() 

2216 for datasetType in source_dataset_types: 

2217 if register_dataset_types: 

2218 # Let this raise immediately if inconsistent. Continuing 

2219 # on to find additional inconsistent dataset types 

2220 # might result in additional unwanted dataset types being 

2221 # registered. 

2222 if self.registry.registerDatasetType(datasetType): 

2223 newly_registered_dataset_types.add(datasetType) 

2224 else: 

2225 # If the dataset type is missing, let it fail immediately. 

2226 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2227 if target_dataset_type != datasetType: 

2228 raise ConflictingDefinitionError( 

2229 "Source butler dataset type differs from definition" 

2230 f" in target butler: {datasetType} !=" 

2231 f" {target_dataset_type}" 

2232 ) 

2233 if newly_registered_dataset_types: 

2234 # We may have registered some even if there were inconsistencies 

2235 # but should let people know (or else remove them again). 

2236 log.log( 

2237 VERBOSE, 

2238 "Registered the following dataset types in the target Butler: %s", 

2239 ", ".join(d.name for d in newly_registered_dataset_types), 

2240 ) 

2241 else: 

2242 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2243 

2244 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2245 if transfer_dimensions: 

2246 # Collect all the dimension records for these refs. 

2247 # All dimensions are to be copied but the list of valid dimensions 

2248 # comes from this butler's universe. 

2249 elements = frozenset( 

2250 element 

2251 for element in self.registry.dimensions.getStaticElements() 

2252 if element.hasTable() and element.viewOf is None 

2253 ) 

2254 dataIds = set(ref.dataId for ref in source_refs) 

2255 # This logic comes from saveDataIds. 

2256 for dataId in dataIds: 

2257 # Should be a no-op if the ref has already been expanded. 

2258 dataId = source_butler.registry.expandDataId(dataId) 

2259 # If this butler doesn't know about a dimension in the source 

2260 # butler things will break later. 

2261 for record in dataId.records.values(): 

2262 if record is not None and record.definition in elements: 

2263 dimension_records[record.definition].setdefault(record.dataId, record) 

2264 

2265 # The returned refs should be identical for UUIDs. 

2266 # For now must also support integers and so need to retain the 

2267 # newly-created refs from this registry. 

2268 # Pre-size it so we can assign refs into the correct slots 

2269 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2270 default_id_gen = DatasetIdGenEnum.UNIQUE 

2271 

2272 handled_collections: Set[str] = set() 

2273 

2274 # Do all the importing in a single transaction. 

2275 with self.transaction(): 

2276 if dimension_records: 

2277 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2278 for element, r in dimension_records.items(): 

2279 records = [r[dataId] for dataId in r] 

2280 # Assume that if the record is already present that we can 

2281 # use it without having to check that the record metadata 

2282 # is consistent. 

2283 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2284 

2285 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2286 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2287 ): 

2288 if run not in handled_collections: 

2289 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2290 registered = self.registry.registerRun(run, doc=run_doc) 

2291 handled_collections.add(run) 

2292 if registered: 

2293 log.log(VERBOSE, "Creating output run %s", run) 

2294 

2295 id_generation_mode = default_id_gen 

2296 if isinstance(refs_to_import[0].id, int): 

2297 # ID generation mode might need to be overridden when 

2298 # targeting UUID 

2299 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2300 

2301 n_refs = len(refs_to_import) 

2302 log.verbose( 

2303 "Importing %d ref%s of dataset type %s into run %s", 

2304 n_refs, 

2305 "" if n_refs == 1 else "s", 

2306 datasetType.name, 

2307 run, 

2308 ) 

2309 

2310 # No way to know if this butler's registry uses UUID. 

2311 # We have to trust the caller on this. If it fails they will 

2312 # have to change their approach. We can't catch the exception 

2313 # and retry with unique because that will mess up the 

2314 # transaction handling. We aren't allowed to ask the registry 

2315 # manager what type of ID it is using. 

2316 imported_refs = self.registry._importDatasets( 

2317 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2318 ) 

2319 

2320 # Map them into the correct slots to match the initial order 

2321 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2322 transferred_refs_tmp[i] = ref 

2323 

2324 # Mypy insists that we might have None in here so we have to make 

2325 # that explicit by assigning to a new variable and filtering out 

2326 # something that won't be there. 

2327 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2328 

2329 # Check consistency 

2330 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2331 

2332 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2333 

2334 # The transferred refs need to be reordered to match the original 

2335 # ordering given by the caller. Without this the datastore transfer 

2336 # will be broken. 

2337 

2338 # Ask the datastore to transfer. The datastore has to check that 

2339 # the source datastore is compatible with the target datastore. 

2340 self.datastore.transfer_from( 

2341 source_butler.datastore, 

2342 source_refs, 

2343 local_refs=transferred_refs, 

2344 transfer=transfer, 

2345 artifact_existence=artifact_existence, 

2346 ) 

2347 

2348 return transferred_refs 

2349 
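# Illustrative usage sketch (hypothetical repositories and collection):
# pulling every dataset of a run from another butler, registering any
# missing dataset types on the way.
#
#     source = Butler("/repo/source")
#     refs = source.registry.queryDatasets(..., collections="u/user/run")
#     transferred = butler.transfer_from(source, refs, transfer="copy",
#                                        register_dataset_types=True)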

2350 def validateConfiguration( 

2351 self, 

2352 logFailures: bool = False, 

2353 datasetTypeNames: Optional[Iterable[str]] = None, 

2354 ignore: Optional[Iterable[str]] = None, 

2355 ) -> None: 

2356 """Validate butler configuration. 

2357 

2358 Checks that each `DatasetType` can be stored in the `Datastore`. 

2359 

2360 Parameters 

2361 ---------- 

2362 logFailures : `bool`, optional 

2363 If `True`, output a log message for every validation error 

2364 detected. 

2365 datasetTypeNames : iterable of `str`, optional 

2366 The `DatasetType` names that should be checked. This allows 

2367 only a subset to be selected. 

2368 ignore : iterable of `str`, optional 

2369 Names of DatasetTypes to skip over. This can be used to skip 

2370 known problems. If a named `DatasetType` corresponds to a 

2371 composite, all components of that `DatasetType` will also be 

2372 ignored. 

2373 

2374 Raises 

2375 ------ 

2376 ButlerValidationError 

2377 Raised if there is some inconsistency with how this Butler 

2378 is configured. 

2379 """ 

2380 if datasetTypeNames: 

2381 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2382 else: 

2383 datasetTypes = list(self.registry.queryDatasetTypes()) 

2384 

2385 # filter out anything from the ignore list 

2386 if ignore: 

2387 ignore = set(ignore) 

2388 datasetTypes = [ 

2389 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2390 ] 

2391 else: 

2392 ignore = set() 

2393 

2394 # Find all the registered instruments 

2395 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2396 

2397 # For each datasetType that has an instrument dimension, create 

2398 # a DatasetRef for each defined instrument 

2399 datasetRefs = [] 

2400 

2401 for datasetType in datasetTypes: 

2402 if "instrument" in datasetType.dimensions: 

2403 for instrument in instruments: 

2404 datasetRef = DatasetRef( 

2405 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2406 ) 

2407 datasetRefs.append(datasetRef) 

2408 

2409 entities: List[Union[DatasetType, DatasetRef]] = [] 

2410 entities.extend(datasetTypes) 

2411 entities.extend(datasetRefs) 

2412 

2413 datastoreErrorStr = None 

2414 try: 

2415 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2416 except ValidationError as e: 

2417 datastoreErrorStr = str(e) 

2418 

2419 # Also check that the LookupKeys used by the datastores match 

2420 # registry and storage class definitions 

2421 keys = self.datastore.getLookupKeys() 

2422 

2423 failedNames = set() 

2424 failedDataId = set() 

2425 for key in keys: 

2426 if key.name is not None: 

2427 if key.name in ignore: 

2428 continue 

2429 

2430 # skip if specific datasetType names were requested and this 

2431 # name does not match 

2432 if datasetTypeNames and key.name not in datasetTypeNames: 

2433 continue 

2434 

2435 # See if it is a StorageClass or a DatasetType 

2436 if key.name in self.storageClasses: 

2437 pass 

2438 else: 

2439 try: 

2440 self.registry.getDatasetType(key.name) 

2441 except KeyError: 

2442 if logFailures: 

2443 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2444 failedNames.add(key) 

2445 else: 

2446 # Dimensions are checked for consistency when the Butler 

2447 # is created and rendezvoused with a universe. 

2448 pass 

2449 

2450 # Check that the instrument is a valid instrument 

2451 # Currently only support instrument so check for that 

2452 if key.dataId: 

2453 dataIdKeys = set(key.dataId) 

2454 if set(["instrument"]) != dataIdKeys: 

2455 if logFailures: 

2456 log.critical("Key '%s' has unsupported DataId override", key) 

2457 failedDataId.add(key) 

2458 elif key.dataId["instrument"] not in instruments: 

2459 if logFailures: 

2460 log.critical("Key '%s' has unknown instrument", key) 

2461 failedDataId.add(key) 

2462 

2463 messages = [] 

2464 

2465 if datastoreErrorStr: 

2466 messages.append(datastoreErrorStr) 

2467 

2468 for failed, msg in ( 

2469 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2470 (failedDataId, "Keys with bad DataId entries: "), 

2471 ): 

2472 if failed: 

2473 msg += ", ".join(str(k) for k in failed) 

2474 messages.append(msg) 

2475 

2476 if messages: 

2477 raise ValidationError(";\n".join(messages)) 

2478 
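# Illustrative usage sketch (hypothetical dataset type names): validating a
# subset of dataset types while skipping one that is known to be problematic;
# a ValidationError is raised if any check fails (see the Raises section).
#
#     butler.validateConfiguration(logFailures=True,
#                                  datasetTypeNames=["raw", "calexp"],
#                                  ignore=["brokenType"])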

2479 @property 

2480 def collections(self) -> CollectionSearch: 

2481 """The collections to search by default, in order (`CollectionSearch`). 

2482 

2483 This is an alias for ``self.registry.defaults.collections``. It cannot 

2484 be set directly in isolation, but all defaults may be changed together 

2485 by assigning a new `RegistryDefaults` instance to 

2486 ``self.registry.defaults``. 

2487 """ 

2488 return self.registry.defaults.collections 

2489 

2490 @property 

2491 def run(self) -> Optional[str]: 

2492 """Name of the run this butler writes outputs to by default (`str` or 

2493 `None`). 

2494 

2495 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2496 directly in isolation, but all defaults may be changed together by 

2497 assigning a new `RegistryDefaults` instance to 

2498 ``self.registry.defaults``. 

2499 """ 

2500 return self.registry.defaults.run 

2501 

2502 @property 

2503 def dimensions(self) -> DimensionUniverse: 

2504 # Docstring inherited. 

2505 return self.registry.dimensions 

2506 

2507 registry: Registry 

2508 """The object that manages dataset metadata and relationships (`Registry`). 

2509 

2510 Most operations that don't involve reading or writing butler datasets are 

2511 accessible only via `Registry` methods. 

2512 """ 

2513 

2514 datastore: Datastore 

2515 """The object that manages actual dataset storage (`Datastore`). 

2516 

2517 Direct user access to the datastore should rarely be necessary; the primary 

2518 exception is the case where a `Datastore` implementation provides extra 

2519 functionality beyond what the base class defines. 

2520 """ 

2521 

2522 storageClasses: StorageClassFactory 

2523 """An object that maps known storage class names to objects that fully 

2524 describe them (`StorageClassFactory`). 

2525 """ 

2526 

2527 _allow_put_of_predefined_dataset: bool 

2528 """Allow a put to succeed even if there is already a registry entry for it 

2529 but not a datastore record. (`bool`)."""