Coverage for python/lsst/daf/butler/_butler.py: 10%


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import numbers 

41import os 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImportType 

65from lsst.utils.introspection import get_class_of 

66from lsst.utils.logging import getLogger, VERBOSE 

67from .core import ( 

68 AmbiguousDatasetError, 

69 ButlerURI, 

70 Config, 

71 ConfigSubset, 

72 DataCoordinate, 

73 DataId, 

74 DataIdValue, 

75 DatasetRef, 

76 DatasetType, 

77 Datastore, 

78 Dimension, 

79 DimensionConfig, 

80 FileDataset, 

81 Progress, 

82 StorageClassFactory, 

83 Timespan, 

84 ValidationError, 

85) 

86from .core.repoRelocation import BUTLER_ROOT_TAG 

87from .core.utils import transactional 

88from ._deferredDatasetHandle import DeferredDatasetHandle 

89from ._butlerConfig import ButlerConfig 

90from ._butlerRepoIndex import ButlerRepoIndex 

91from .registry import ( 

92 Registry, 

93 RegistryConfig, 

94 RegistryDefaults, 

95 CollectionSearch, 

96 CollectionType, 

97 ConflictingDefinitionError, 

98 DatasetIdGenEnum, 

99) 

100from .transfers import RepoExportContext 

101 

102log = getLogger(__name__) 

103 

104 

105class ButlerValidationError(ValidationError): 

106 """There is a problem with the Butler configuration.""" 

107 pass 

108 

109 

110class PruneCollectionsArgsError(TypeError): 

111 """Base class for errors relating to Butler.pruneCollections input 

112 arguments. 

113 """ 

114 pass 

115 

116 

117class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

118 """Raised when purge and unstore are both required to be True, and 

119 purge is True but unstore is False. 

120 """ 

121 

122 def __init__(self) -> None: 

123 super().__init__("Cannot pass purge=True without unstore=True.") 

124 

125 

126class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

127 """Raised when pruning a RUN collection but purge is False.""" 

128 

129 def __init__(self, collectionType: CollectionType): 

130 self.collectionType = collectionType 

131 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

132 

133 

134class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

135 """Raised when purge is True but is not supported for the given 

136 collection.""" 

137 

138 def __init__(self, collectionType: CollectionType): 

139 self.collectionType = collectionType 

140 super().__init__( 

141 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

142 

143 

144class Butler: 

145 """Main entry point for the data access system. 

146 

147 Parameters 

148 ---------- 

149 config : `ButlerConfig`, `Config` or `str`, optional 

150 Configuration. Anything acceptable to the 

151 `ButlerConfig` constructor. If a directory path 

152 is given the configuration will be read from a ``butler.yaml`` file in 

153 that location. If `None` is given default values will be used. 

154 butler : `Butler`, optional 

155 If provided, construct a new Butler that uses the same registry and 

156 datastore as the given one, but with the given collection and run. 

157 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

158 arguments. 

159 collections : `str` or `Iterable` [ `str` ], optional 

160 An expression specifying the collections to be searched (in order) when 

161 reading datasets. 

162 This may be a `str` collection name or an iterable thereof. 

163 See :ref:`daf_butler_collection_expressions` for more information. 

164 These collections are not registered automatically and must be 

165 manually registered before they are used by any method, but they may be 

166 manually registered after the `Butler` is initialized. 

167 run : `str`, optional 

168 Name of the `~CollectionType.RUN` collection new datasets should be 

169 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

170 ``collections`` will be set to ``[run]``. If not `None`, this 

171 collection will automatically be registered. If this is not set (and 

172 ``writeable`` is not set either), a read-only butler will be created. 

173 searchPaths : `list` of `str`, optional 

174 Directory paths to search when calculating the full Butler 

175 configuration. Not used if the supplied config is already a 

176 `ButlerConfig`. 

177 writeable : `bool`, optional 

178 Explicitly sets whether the butler supports write operations. If not 

179 provided, a read-write butler is created if any of ``run``, ``tags``, 

180 or ``chains`` is non-empty. 

181 inferDefaults : `bool`, optional 

182 If `True` (default) infer default data ID values from the values 

183 present in the datasets in ``collections``: if all collections have the 

184 same value (or no value) for a governor dimension, that value will be 

185 the default for that dimension. Nonexistent collections are ignored. 

186 If a default value is provided explicitly for a governor dimension via 

187 ``**kwargs``, no default will be inferred for that dimension. 

188 **kwargs : `str` 

189 Default data ID key-value pairs. These may only identify "governor" 

190 dimensions like ``instrument`` and ``skymap``. 

191 

192 Examples 

193 -------- 

194 While there are many ways to control exactly how a `Butler` interacts with 

195 the collections in its `Registry`, the most common cases are still simple. 

196 

197 For a read-only `Butler` that searches one collection, do:: 

198 

199 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

200 

201 For a read-write `Butler` that writes to and reads from a 

202 `~CollectionType.RUN` collection:: 

203 

204 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

205 

206 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

207 because we want to write to one `~CollectionType.RUN` collection but read 

208 from several others (as well):: 

209 

210 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

211 collections=["u/alice/DM-50000/a", 

212 "u/bob/DM-49998", 

213 "HSC/defaults"]) 

214 

215 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

216 Datasets will be read first from that run (since it appears first in the 

217 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

218 

219 Finally, one can always create a `Butler` with no collections:: 

220 

221 butler = Butler("/path/to/repo", writeable=True) 

222 

223 This can be extremely useful when you just want to use ``butler.registry``, 

224 e.g. for inserting dimension data or managing collections, or when the 

225 collections you want to use with the butler are not consistent. 

226 Passing ``writeable`` explicitly here is only necessary if you want to be 

227 able to make changes to the repo; usually the value for ``writeable`` can 

228 be guessed from the collection arguments provided, but it defaults to 

229 `False` when no collection arguments are given. 

230 """ 

231 def __init__(self, config: Union[Config, str, None] = None, *, 

232 butler: Optional[Butler] = None, 

233 collections: Any = None, 

234 run: Optional[str] = None, 

235 searchPaths: Optional[List[str]] = None, 

236 writeable: Optional[bool] = None, 

237 inferDefaults: bool = True, 

238 **kwargs: str, 

239 ): 

240 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

241 # Load registry, datastore, etc. from config or existing butler. 

242 if butler is not None: 

243 if config is not None or searchPaths is not None or writeable is not None: 

244 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

245 "arguments with 'butler' argument.") 

246 self.registry = butler.registry.copy(defaults) 

247 self.datastore = butler.datastore 

248 self.storageClasses = butler.storageClasses 

249 self._config: ButlerConfig = butler._config 

250 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

251 else: 

252 self._config = ButlerConfig(config, searchPaths=searchPaths) 

253 try: 

254 if "root" in self._config: 

255 butlerRoot = self._config["root"] 

256 else: 

257 butlerRoot = self._config.configDir 

258 if writeable is None: 

259 writeable = run is not None 

260 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable, 

261 defaults=defaults) 

262 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

263 butlerRoot=butlerRoot) 

264 self.storageClasses = StorageClassFactory() 

265 self.storageClasses.addFromConfig(self._config) 

266 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", 

267 False) 

268 except Exception: 

269 # Failures here usually mean that configuration is incomplete, 

270 # just issue an error message which includes config file URI. 

271 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

272 raise 

273 

274 if "run" in self._config or "collection" in self._config: 

275 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

276 

277 GENERATION: ClassVar[int] = 3 

278 """This is a Generation 3 Butler. 

279 

280 This attribute may be removed in the future, once the Generation 2 Butler 

281 interface has been fully retired; it should only be used in transitional 

282 code. 

283 """ 

284 

285 @classmethod 

286 def get_repo_uri(cls, label: str) -> ButlerURI: 

287 """Look up the label in a butler repository index. 

288 

289 Parameters 

290 ---------- 

291 label : `str` 

292 Label of the Butler repository to look up. 

293 

294 Returns 

295 ------- 

296 uri : `ButlerURI` 

297 URI to the Butler repository associated with the given label. 

298 

299 Raises 

300 ------ 

301 KeyError 

302 Raised if the label is not found in the index, or if an index 

303 can not be found at all. 

304 

305 Notes 

306 ----- 

307 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

308 information is discovered. 
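
        Examples
        --------
        A minimal lookup sketch; the label ``"main"`` below is purely
        illustrative and assumes a repository index has been configured
        for this site::

            uri = Butler.get_repo_uri("main")
            butler = Butler(str(uri))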

309 """ 

310 return ButlerRepoIndex.get_repo_uri(label) 

311 

312 @classmethod 

313 def get_known_repos(cls) -> Set[str]: 

314 """Retrieve the list of known repository labels. 

315 

316 Returns 

317 ------- 

318 repos : `set` of `str` 

319 All the known labels. Can be empty if no index can be found. 

320 

321 Notes 

322 ----- 

323 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

324 information is discovered. 
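
        Examples
        --------
        A short sketch that lists whatever labels the index provides; the
        returned set may be empty if no index is configured::

            for label in sorted(Butler.get_known_repos()):
                print(label, Butler.get_repo_uri(label))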

325 """ 

326 return ButlerRepoIndex.get_known_repos() 

327 

328 @staticmethod 

329 def makeRepo(root: str, config: Union[Config, str, None] = None, 

330 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

331 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

332 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

333 """Create an empty data repository by adding a butler.yaml config 

334 to a repository root directory. 

335 

336 Parameters 

337 ---------- 

338 root : `str` or `ButlerURI` 

339 Path or URI to the root location of the new repository. Will be 

340 created if it does not exist. 

341 config : `Config` or `str`, optional 

342 Configuration to write to the repository, after setting any 

343 root-dependent Registry or Datastore config options. Can not 

344 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

345 configuration will be used. Root-dependent config options 

346 specified in this config are overwritten if ``forceConfigRoot`` 

347 is `True`. 

348 dimensionConfig : `Config` or `str`, optional 

349 Configuration for dimensions, will be used to initialize registry 

350 database. 

351 standalone : `bool` 

352 If True, write all expanded defaults, not just customized or 

353 repository-specific settings. 

354 This (mostly) decouples the repository from the default 

355 configuration, insulating it from changes to the defaults (which 

356 may be good or bad, depending on the nature of the changes). 

357 Future *additions* to the defaults will still be picked up when 

358 initializing `Butlers` to repos created with ``standalone=True``. 

359 searchPaths : `list` of `str`, optional 

360 Directory paths to search when calculating the full butler 

361 configuration. 

362 forceConfigRoot : `bool`, optional 

363 If `False`, any values present in the supplied ``config`` that 

364 would normally be reset are not overridden and will appear 

365 directly in the output config. This allows non-standard overrides 

366 of the root directory for a datastore or registry to be given. 

367 If this parameter is `True` the values for ``root`` will be 

368 forced into the resulting config if appropriate. 

369 outfile : `str`, optional 

370 If not-`None`, the output configuration will be written to this 

371 location rather than into the repository itself. Can be a URI 

372 string. Can refer to a directory that will be used to write 

373 ``butler.yaml``. 

374 overwrite : `bool`, optional 

375 Create a new configuration file even if one already exists 

376 in the specified output location. Default is to raise 

377 an exception. 

378 

379 Returns 

380 ------- 

381 config : `Config` 

382 The updated `Config` instance written to the repo. 

383 

384 Raises 

385 ------ 

386 ValueError 

387 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

388 regular Config (as these subclasses would make it impossible to 

389 support ``standalone=False``). 

390 FileExistsError 

391 Raised if the output config file already exists. 

392 os.error 

393 Raised if the directory does not exist, exists but is not a 

394 directory, or cannot be created. 

395 

396 Notes 

397 ----- 

398 Note that when ``standalone=False`` (the default), the configuration 

399 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

400 construct the repository should also be used to construct any Butlers 

401 to avoid configuration inconsistencies. 
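
        Examples
        --------
        A minimal sketch that creates a repository with the default
        configuration and then opens it (the path is illustrative)::

            Butler.makeRepo("/path/to/new/repo")
            butler = Butler("/path/to/new/repo", writeable=True)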

402 """ 

403 if isinstance(config, (ButlerConfig, ConfigSubset)): 

404 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

405 

406 # Ensure that the root of the repository exists or can be made 

407 uri = ButlerURI(root, forceDirectory=True) 

408 uri.mkdir() 

409 

410 config = Config(config) 

411 

412 # If we are creating a new repo from scratch with relative roots, 

413 # do not propagate an explicit root from the config file 

414 if "root" in config: 

415 del config["root"] 

416 

417 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

418 imported_class = doImportType(full["datastore", "cls"]) 

419 if not issubclass(imported_class, Datastore): 

420 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

421 datastoreClass: Type[Datastore] = imported_class 

422 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

423 

424 # if key exists in given config, parse it, otherwise parse the defaults 

425 # in the expanded config 

426 if config.get(("registry", "db")): 

427 registryConfig = RegistryConfig(config) 

428 else: 

429 registryConfig = RegistryConfig(full) 

430 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

431 if defaultDatabaseUri is not None: 

432 Config.updateParameters(RegistryConfig, config, full, 

433 toUpdate={"db": defaultDatabaseUri}, 

434 overwrite=forceConfigRoot) 

435 else: 

436 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

437 overwrite=forceConfigRoot) 

438 

439 if standalone: 

440 config.merge(full) 

441 else: 

442 # Always expand the registry.managers section into the per-repo 

443 # config, because after the database schema is created, it's not 

444 # allowed to change anymore. Note that in the standalone=True 

445 # branch, _everything_ in the config is expanded, so there's no 

446 # need to special case this. 

447 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

448 configURI: Union[str, ButlerURI] 

449 if outfile is not None: 

450 # When writing to a separate location we must include 

451 # the root of the butler repo in the config else it won't know 

452 # where to look. 

453 config["root"] = uri.geturl() 

454 configURI = outfile 

455 else: 

456 configURI = uri 

457 config.dumpToUri(configURI, overwrite=overwrite) 

458 

459 # Create Registry and populate tables 

460 registryConfig = RegistryConfig(config.get("registry")) 

461 dimensionConfig = DimensionConfig(dimensionConfig) 

462 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

463 

464 log.verbose("Wrote new Butler configuration file to %s", configURI) 

465 

466 return config 

467 

468 @classmethod 

469 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

470 defaultDataId: Dict[str, str], writeable: bool) -> Butler: 

471 """Callable used to unpickle a Butler. 

472 

473 We prefer not to use ``Butler.__init__`` directly so we can force some 

474 of its many arguments to be keyword-only (note that ``__reduce__`` 

475 can only invoke callables with positional arguments). 

476 

477 Parameters 

478 ---------- 

479 config : `ButlerConfig` 

480 Butler configuration, already coerced into a true `ButlerConfig` 

481 instance (and hence after any search paths for overrides have been 

482 utilized). 

483 collections : `CollectionSearch` 

484 Names of the default collections to read from. 

485 run : `str`, optional 

486 Name of the default `~CollectionType.RUN` collection to write to. 

487 defaultDataId : `dict` [ `str`, `str` ] 

488 Default data ID values. 

489 writeable : `bool` 

490 Whether the Butler should support write operations. 

491 

492 Returns 

493 ------- 

494 butler : `Butler` 

495 A new `Butler` instance. 

496 """ 

497 # MyPy doesn't recognize that the kwargs below are totally valid; it 

498 # seems to think ``**defaultDataId`` is a _positional_ argument! 

499 return cls(config=config, collections=collections, run=run, writeable=writeable, 

500 **defaultDataId) # type: ignore 

501 

502 def __reduce__(self) -> tuple: 

503 """Support pickling. 

504 """ 

505 return (Butler._unpickle, (self._config, self.collections, self.run, 

506 self.registry.defaults.dataId.byName(), 

507 self.registry.isWriteable())) 

508 

509 def __str__(self) -> str: 

510 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

511 self.collections, self.run, self.datastore, self.registry) 

512 

513 def isWriteable(self) -> bool: 

514 """Return `True` if this `Butler` supports write operations. 

515 """ 

516 return self.registry.isWriteable() 

517 

518 @contextlib.contextmanager 

519 def transaction(self) -> Iterator[None]: 

520 """Context manager supporting `Butler` transactions. 

521 

522 Transactions can be nested. 
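
        Examples
        --------
        A sketch that groups two writes so they are committed or rolled
        back together (the dataset types and data ID values are
        illustrative)::

            with butler.transaction():
                butler.put(catalog, "sourceTable", visit=123, instrument="HSC")
                butler.put(summary, "visitSummary", visit=123, instrument="HSC")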

523 """ 

524 with self.registry.transaction(): 

525 with self.datastore.transaction(): 

526 yield 

527 

528 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

529 dataId: Optional[DataId] = None, **kwargs: Any 

530 ) -> Tuple[DatasetType, Optional[DataId]]: 

531 """Standardize the arguments passed to several Butler APIs. 

532 

533 Parameters 

534 ---------- 

535 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

536 When `DatasetRef` the `dataId` should be `None`. 

537 Otherwise the `DatasetType` or name thereof. 

538 dataId : `dict` or `DataCoordinate` 

539 A `dict` of `Dimension` link name, value pairs that label the 

540 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

541 should be provided as the second argument. 

542 **kwargs 

543 Additional keyword arguments used to augment or construct a 

544 `DataCoordinate`. See `DataCoordinate.standardize` 

545 parameters. 

546 

547 Returns 

548 ------- 

549 datasetType : `DatasetType` 

550 A `DatasetType` instance extracted from ``datasetRefOrType``. 

551 dataId : `dict` or `DataId`, optional 

552 Argument that can be used (along with ``kwargs``) to construct a 

553 `DataId`. 

554 

555 Notes 

556 ----- 

557 Butler APIs that conceptually need a DatasetRef also allow passing a 

558 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

559 keyword arguments that can be used to construct one) separately. This 

560 method accepts those arguments and always returns a true `DatasetType` 

561 and a `DataId` or `dict`. 

562 

563 Standardization of `dict` vs `DataId` is best handled by passing the 

564 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

565 generally similarly flexible. 

566 """ 

567 externalDatasetType: Optional[DatasetType] = None 

568 internalDatasetType: Optional[DatasetType] = None 

569 if isinstance(datasetRefOrType, DatasetRef): 

570 if dataId is not None or kwargs: 

571 raise ValueError("DatasetRef given, cannot use dataId as well") 

572 externalDatasetType = datasetRefOrType.datasetType 

573 dataId = datasetRefOrType.dataId 

574 else: 

575 # Don't check whether DataId is provided, because Registry APIs 

576 # can usually construct a better error message when it wasn't. 

577 if isinstance(datasetRefOrType, DatasetType): 

578 externalDatasetType = datasetRefOrType 

579 else: 

580 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

581 

582 # Check that they are self-consistent 

583 if externalDatasetType is not None: 

584 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

585 if externalDatasetType != internalDatasetType: 

586 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

587 f"registry definition ({internalDatasetType})") 

588 

589 assert internalDatasetType is not None 

590 return internalDatasetType, dataId 

591 

592 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType, 

593 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]: 

594 """Rewrite a data ID taking into account dimension records. 

595 

596 Take a Data ID and keyword args and rewrite it if necessary to 

597 allow the user to specify dimension records rather than dimension 

598 primary values. 

599 

600 This allows a user to include a dataId dict with keys of 

601 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

602 the integer exposure ID. It also allows a string to be given 

603 for a dimension value rather than the integer ID if that is more 

604 convenient. For example, rather than having to specify the 

605 detector with ``detector.full_name``, a string given for ``detector`` 

606 will be interpreted as the full name and converted to the integer 

607 value. 

608 

609 Keyword arguments can also use strings for dimensions like detector 

610 and exposure but python does not allow them to include ``.`` and 

611 so the ``exposure.day_obs`` syntax can not be used in a keyword 

612 argument. 

613 

614 Parameters 

615 ---------- 

616 dataId : `dict` or `DataCoordinate` 

617 A `dict` of `Dimension` link name, value pairs that will label the 

618 `DatasetRef` within a Collection. 

619 datasetType : `DatasetType` 

620 The dataset type associated with this dataId. Required to 

621 determine the relevant dimensions. 

622 **kwargs 

623 Additional keyword arguments used to augment or construct a 

624 `DataId`. See `DataId` parameters. 

625 

626 Returns 

627 ------- 

628 dataId : `dict` or `DataCoordinate` 

629 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

630 no keyword arguments, the original dataId will be returned 

631 unchanged. 

632 **kwargs : `dict` 

633 Any unused keyword arguments. 
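
        Examples
        --------
        This helper is internal, but its effect is visible through methods
        such as `Butler.get`; the instrument, dataset type, and record
        values below are illustrative::

            # ``exposure.day_obs``/``exposure.seq_num`` are resolved to the
            # exposure primary key before the dataset lookup is performed.
            butler.get("raw", {"instrument": "LSSTCam",
                               "exposure.day_obs": 20210123,
                               "exposure.seq_num": 123},
                       detector=42)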

634 """ 

635 # Do nothing if we have a standalone DataCoordinate. 

636 if isinstance(dataId, DataCoordinate) and not kwargs: 

637 return dataId, kwargs 

638 

639 # Process dimension records that are using record information 

640 # rather than ids 

641 newDataId: Dict[str, DataIdValue] = {} 

642 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

643 

644 # if all the dataId comes from keyword parameters we do not need 

645 # to do anything here because they can't be of the form 

646 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

647 if dataId: 

648 for k, v in dataId.items(): 

649 # If we have a Dimension we do not need to do anything 

650 # because it cannot be a compound key. 

651 if isinstance(k, str) and "." in k: 

652 # Someone is using a more human-readable dataId 

653 dimensionName, record = k.split(".", 1) 

654 byRecord[dimensionName][record] = v 

655 elif isinstance(k, Dimension): 

656 newDataId[k.name] = v 

657 else: 

658 newDataId[k] = v 

659 

660 # Go through the updated dataId and check the type in case someone is 

661 # using an alternate key. We have already filtered out the compound 

662 # keys dimensions.record format. 

663 not_dimensions = {} 

664 

665 # Will need to look in the dataId and the keyword arguments 

666 # and will remove them if they need to be fixed or are unrecognized. 

667 for dataIdDict in (newDataId, kwargs): 

668 # Use a list so we can adjust the dict safely in the loop 

669 for dimensionName in list(dataIdDict): 

670 value = dataIdDict[dimensionName] 

671 try: 

672 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

673 except KeyError: 

674 # This is not a real dimension 

675 not_dimensions[dimensionName] = value 

676 del dataIdDict[dimensionName] 

677 continue 

678 

679 # Convert an integral type to an explicit int to simplify 

680 # comparisons here 

681 if isinstance(value, numbers.Integral): 

682 value = int(value) 

683 

684 if not isinstance(value, dimension.primaryKey.getPythonType()): 

685 for alternate in dimension.alternateKeys: 

686 if isinstance(value, alternate.getPythonType()): 

687 byRecord[dimensionName][alternate.name] = value 

688 del dataIdDict[dimensionName] 

689 log.debug("Converting dimension %s to %s.%s=%s", 

690 dimensionName, dimensionName, alternate.name, value) 

691 break 

692 else: 

693 log.warning("Type mismatch found for value '%r' provided for dimension %s. " 

694 "Could not find matching alternative (primary key has type %s) " 

695 "so attempting to use as-is.", 

696 value, dimensionName, dimension.primaryKey.getPythonType()) 

697 

698 # If we have some unrecognized dimensions we have to try to connect 

699 # them to records in other dimensions. This is made more complicated 

700 # by some dimensions having records with clashing names. A mitigation 

701 # is that we can tell by this point which dimensions are missing 

702 # for the DatasetType but this does not work for calibrations 

703 # where additional dimensions can be used to constrain the temporal 

704 # axis. 

705 if not_dimensions: 

706 # Calculate missing dimensions 

707 provided = set(newDataId) | set(kwargs) | set(byRecord) 

708 missingDimensions = datasetType.dimensions.names - provided 

709 

710 # For calibrations we may well be needing temporal dimensions 

711 # so rather than always including all dimensions in the scan 

712 # restrict things a little. It is still possible for there 

713 # to be confusion over day_obs in visit vs exposure for example. 

714 # If we are not searching calibration collections things may 

715 # fail but they are going to fail anyway because of the 

716 # ambiguity of the dataId... 

717 candidateDimensions: Set[str] = set() 

718 candidateDimensions.update(missingDimensions) 

719 if datasetType.isCalibration(): 

720 for dim in self.registry.dimensions.getStaticDimensions(): 

721 if dim.temporal: 

722 candidateDimensions.add(str(dim)) 

723 

724 # Look up table for the first association with a dimension 

725 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

726 

727 # Keep track of whether an item is associated with multiple 

728 # dimensions. 

729 counter: Counter[str] = Counter() 

730 assigned: Dict[str, Set[str]] = defaultdict(set) 

731 

732 # Go through the missing dimensions and associate the 

733 # given names with records within those dimensions 

734 matched_dims = set() 

735 for dimensionName in candidateDimensions: 

736 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

737 fields = dimension.metadata.names | dimension.uniqueKeys.names 

738 for field in not_dimensions: 

739 if field in fields: 

740 guessedAssociation[dimensionName][field] = not_dimensions[field] 

741 counter[dimensionName] += 1 

742 assigned[field].add(dimensionName) 

743 matched_dims.add(field) 

744 

745 # Calculate the fields that matched nothing. 

746 never_found = set(not_dimensions) - matched_dims 

747 

748 if never_found: 

749 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

750 

751 # There is a chance we have allocated a single dataId item 

752 # to multiple dimensions. Need to decide which should be retained. 

753 # For now assume that the most popular alternative wins. 

754 # This means that day_obs with seq_num will result in 

755 # exposure.day_obs and not visit.day_obs 

756 # Also prefer an explicitly missing dimension over an inferred 

757 # temporal dimension. 

758 for fieldName, assignedDimensions in assigned.items(): 

759 if len(assignedDimensions) > 1: 

760 # Pick the most popular (preferring mandatory dimensions) 

761 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

762 if requiredButMissing: 

763 candidateDimensions = requiredButMissing 

764 else: 

765 candidateDimensions = assignedDimensions 

766 

767 # Select the relevant items and get a new restricted 

768 # counter. 

769 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

770 duplicatesCounter: Counter[str] = Counter() 

771 duplicatesCounter.update(theseCounts) 

772 

773 # Choose the most common. If they are equally common 

774 # we will pick the one that was found first. 

775 # Returns a list of tuples 

776 selected = duplicatesCounter.most_common(1)[0][0] 

777 

778 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

779 " Removed ambiguity by choosing dimension %s.", 

780 fieldName, ", ".join(assignedDimensions), selected) 

781 

782 for candidateDimension in assignedDimensions: 

783 if candidateDimension != selected: 

784 del guessedAssociation[candidateDimension][fieldName] 

785 

786 # Update the record look up dict with the new associations 

787 for dimensionName, values in guessedAssociation.items(): 

788 if values: # A dict might now be empty 

789 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", 

790 dimensionName, values) 

791 byRecord[dimensionName].update(values) 

792 

793 if byRecord: 

794 # Some record specifiers were found so we need to convert 

795 # them to the Id form 

796 for dimensionName, values in byRecord.items(): 

797 if dimensionName in newDataId: 

798 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

799 " general record specifiers for it of %s. Ignoring record information.", 

800 dimensionName, newDataId[dimensionName], str(values)) 

801 continue 

802 

803 # Build up a WHERE expression 

804 bind = {k: v for k, v in values.items()} 

805 where = " AND ".join(f"{dimensionName}.{k} = {k}" 

806 for k in bind) 

807 

808 # Hopefully we get a single record that matches 

809 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

810 where=where, bind=bind, **kwargs)) 

811 

812 if len(records) != 1: 

813 if len(records) > 1: 

814 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

815 for r in records: 

816 log.debug("- %s", str(r)) 

817 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

818 f" uniquely constrained to a single dataset by {values}." 

819 f" Got {len(records)} results.") 

820 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

821 f" records when constrained by {values}") 

822 

823 # Get the primary key from the real dimension object 

824 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

825 if not isinstance(dimension, Dimension): 

826 raise RuntimeError( 

827 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

828 ) 

829 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

830 

831 # We have modified the dataId so need to switch to it 

832 dataId = newDataId 

833 

834 return dataId, kwargs 

835 

836 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

837 dataId: Optional[DataId] = None, *, 

838 collections: Any = None, 

839 allowUnresolved: bool = False, 

840 **kwargs: Any) -> DatasetRef: 

841 """Shared logic for methods that start with a search for a dataset in 

842 the registry. 

843 

844 Parameters 

845 ---------- 

846 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

847 When `DatasetRef` the `dataId` should be `None`. 

848 Otherwise the `DatasetType` or name thereof. 

849 dataId : `dict` or `DataCoordinate`, optional 

850 A `dict` of `Dimension` link name, value pairs that label the 

851 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

852 should be provided as the first argument. 

853 collections : Any, optional 

854 Collections to be searched, overriding ``self.collections``. 

855 Can be any of the types supported by the ``collections`` argument 

856 to butler construction. 

857 allowUnresolved : `bool`, optional 

858 If `True`, return an unresolved `DatasetRef` if finding a resolved 

859 one in the `Registry` fails. Defaults to `False`. 

860 **kwargs 

861 Additional keyword arguments used to augment or construct a 

862 `DataId`. See `DataId` parameters. 

863 

864 Returns 

865 ------- 

866 ref : `DatasetRef` 

867 A reference to the dataset identified by the given arguments. 

868 

869 Raises 

870 ------ 

871 LookupError 

872 Raised if no matching dataset exists in the `Registry` (and 

873 ``allowUnresolved is False``). 

874 ValueError 

875 Raised if a resolved `DatasetRef` was passed as an input, but it 

876 differs from the one found in the registry. 

877 TypeError 

878 Raised if no collections were provided. 

879 """ 

880 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

881 if isinstance(datasetRefOrType, DatasetRef): 

882 idNumber = datasetRefOrType.id 

883 else: 

884 idNumber = None 

885 timespan: Optional[Timespan] = None 

886 

887 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

888 

889 if datasetType.isCalibration(): 

890 # Because this is a calibration dataset, first try to 

891 # standardize the data ID without restricting the dimensions to 

892 # those of the dataset type requested, because there may be extra 

893 # dimensions that provide temporal information for a validity-range 

894 # lookup. 

895 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, 

896 defaults=self.registry.defaults.dataId, **kwargs) 

897 if dataId.graph.temporal: 

898 dataId = self.registry.expandDataId(dataId) 

899 timespan = dataId.timespan 

900 else: 

901 # Standardize the data ID to just the dimensions of the dataset 

902 # type instead of letting registry.findDataset do it, so we get the 

903 # result even if no dataset is found. 

904 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

905 defaults=self.registry.defaults.dataId, **kwargs) 

906 # Always lookup the DatasetRef, even if one is given, to ensure it is 

907 # present in the current collection. 

908 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

909 if ref is None: 

910 if allowUnresolved: 

911 return DatasetRef(datasetType, dataId) 

912 else: 

913 if collections is None: 

914 collections = self.registry.defaults.collections 

915 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

916 f"could not be found in collections {collections}.") 

917 if idNumber is not None and idNumber != ref.id: 

918 if collections is None: 

919 collections = self.registry.defaults.collections 

920 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

921 f"id ({ref.id}) in registry in collections {collections}.") 

922 return ref 

923 

924 @transactional 

925 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

926 dataId: Optional[DataId] = None, *, 

927 run: Optional[str] = None, 

928 **kwargs: Any) -> DatasetRef: 

929 """Store and register a dataset. 

930 

931 Parameters 

932 ---------- 

933 obj : `object` 

934 The dataset. 

935 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

936 When `DatasetRef` is provided, ``dataId`` should be `None`. 

937 Otherwise the `DatasetType` or name thereof. 

938 dataId : `dict` or `DataCoordinate` 

939 A `dict` of `Dimension` link name, value pairs that label the 

940 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

941 should be provided as the second argument. 

942 run : `str`, optional 

943 The name of the run the dataset should be added to, overriding 

944 ``self.run``. 

945 **kwargs 

946 Additional keyword arguments used to augment or construct a 

947 `DataCoordinate`. See `DataCoordinate.standardize` 

948 parameters. 

949 

950 Returns 

951 ------- 

952 ref : `DatasetRef` 

953 A reference to the stored dataset, updated with the correct id if 

954 given. 

955 

956 Raises 

957 ------ 

958 TypeError 

959 Raised if the butler is read-only or if no run has been provided. 
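
        Examples
        --------
        A minimal sketch, assuming the butler was constructed with a
        default ``run`` and that the (illustrative) ``sourceTable``
        dataset type has already been registered::

            ref = butler.put(catalog, "sourceTable", visit=123, instrument="HSC")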

960 """ 

961 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

962 if not self.isWriteable(): 

963 raise TypeError("Butler is read-only.") 

964 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

965 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

966 raise ValueError("DatasetRef must not be in registry, must have None id") 

967 

968 # Handle dimension records in dataId 

969 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

970 

971 # Add Registry Dataset entry. 

972 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

973 

974 # For an execution butler the datasets will be pre-defined. 

975 # If the butler is configured that way datasets should only be inserted 

976 # if they do not already exist in registry. Trying and catching 

977 # ConflictingDefinitionError will not work because the transaction 

978 # will be corrupted. Instead, in this mode always check first. 

979 ref = None 

980 ref_is_predefined = False 

981 if self._allow_put_of_predefined_dataset: 

982 # Get the matching ref for this run. 

983 ref = self.registry.findDataset(datasetType, collections=run, 

984 dataId=dataId) 

985 

986 if ref: 

987 # Must be expanded form for datastore templating 

988 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

989 ref = ref.expanded(dataId) 

990 ref_is_predefined = True 

991 

992 if not ref: 

993 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

994 

995 # If the ref is predefined it is possible that the datastore also 

996 # has the record. Asking datastore to put it again will result in 

997 # the artifact being recreated, overwriting previous, then will cause 

998 # a failure in writing the record which will cause the artifact 

999 # to be removed. Much safer to ask first before attempting to 

1000 # overwrite. Race conditions should not be an issue for the 

1001 # execution butler environment. 

1002 if ref_is_predefined: 

1003 if self.datastore.knows(ref): 

1004 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

1005 

1006 self.datastore.put(obj, ref) 

1007 

1008 return ref 

1009 

1010 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1011 """Retrieve a stored dataset. 

1012 

1013 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1014 collection to be read as long as the `DatasetRef` that identifies them 

1015 can be obtained separately. 

1016 

1017 Parameters 

1018 ---------- 

1019 ref : `DatasetRef` 

1020 Resolved reference to an already stored dataset. 

1021 parameters : `dict` 

1022 Additional StorageClass-defined options to control reading, 

1023 typically used to efficiently read only a subset of the dataset. 

1024 

1025 Returns 

1026 ------- 

1027 obj : `object` 

1028 The dataset. 
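
        Examples
        --------
        A sketch that resolves a reference directly through the registry
        and then reads it; the collection and data ID values are
        illustrative::

            ref = butler.registry.findDataset("calexp", collections="HSC/runs/RC2",
                                              instrument="HSC", visit=123,
                                              detector=42)
            if ref is not None:
                calexp = butler.getDirect(ref)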

1029 """ 

1030 return self.datastore.get(ref, parameters=parameters) 

1031 

1032 def getDirectDeferred(self, ref: DatasetRef, *, 

1033 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

1034 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1035 from a resolved `DatasetRef`. 

1036 

1037 Parameters 

1038 ---------- 

1039 ref : `DatasetRef` 

1040 Resolved reference to an already stored dataset. 

1041 parameters : `dict` 

1042 Additional StorageClass-defined options to control reading, 

1043 typically used to efficiently read only a subset of the dataset. 

1044 

1045 Returns 

1046 ------- 

1047 obj : `DeferredDatasetHandle` 

1048 A handle which can be used to retrieve a dataset at a later time. 

1049 

1050 Raises 

1051 ------ 

1052 AmbiguousDatasetError 

1053 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1054 """ 

1055 if ref.id is None: 

1056 raise AmbiguousDatasetError( 

1057 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1058 ) 

1059 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1060 

1061 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1062 dataId: Optional[DataId] = None, *, 

1063 parameters: Union[dict, None] = None, 

1064 collections: Any = None, 

1065 **kwargs: Any) -> DeferredDatasetHandle: 

1066 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1067 after an immediate registry lookup. 

1068 

1069 Parameters 

1070 ---------- 

1071 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1072 When `DatasetRef` the `dataId` should be `None`. 

1073 Otherwise the `DatasetType` or name thereof. 

1074 dataId : `dict` or `DataCoordinate`, optional 

1075 A `dict` of `Dimension` link name, value pairs that label the 

1076 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1077 should be provided as the first argument. 

1078 parameters : `dict` 

1079 Additional StorageClass-defined options to control reading, 

1080 typically used to efficiently read only a subset of the dataset. 

1081 collections : Any, optional 

1082 Collections to be searched, overriding ``self.collections``. 

1083 Can be any of the types supported by the ``collections`` argument 

1084 to butler construction. 

1085 **kwargs 

1086 Additional keyword arguments used to augment or construct a 

1087 `DataId`. See `DataId` parameters. 

1088 

1089 Returns 

1090 ------- 

1091 obj : `DeferredDatasetHandle` 

1092 A handle which can be used to retrieve a dataset at a later time. 

1093 

1094 Raises 

1095 ------ 

1096 LookupError 

1097 Raised if no matching dataset exists in the `Registry` (and 

1098 ``allowUnresolved is False``). 

1099 ValueError 

1100 Raised if a resolved `DatasetRef` was passed as an input, but it 

1101 differs from the one found in the registry. 

1102 TypeError 

1103 Raised if no collections were provided. 
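
        Examples
        --------
        A sketch that defers the actual read until the handle is used
        (dataset type and data ID values are illustrative)::

            handle = butler.getDeferred("calexp", instrument="HSC", visit=123,
                                        detector=42)
            # ... later, and only if actually needed ...
            calexp = handle.get()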

1104 """ 

1105 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1106 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1107 

1108 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1109 dataId: Optional[DataId] = None, *, 

1110 parameters: Optional[Dict[str, Any]] = None, 

1111 collections: Any = None, 

1112 **kwargs: Any) -> Any: 

1113 """Retrieve a stored dataset. 

1114 

1115 Parameters 

1116 ---------- 

1117 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1118 When `DatasetRef` the `dataId` should be `None`. 

1119 Otherwise the `DatasetType` or name thereof. 

1120 dataId : `dict` or `DataCoordinate` 

1121 A `dict` of `Dimension` link name, value pairs that label the 

1122 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1123 should be provided as the first argument. 

1124 parameters : `dict` 

1125 Additional StorageClass-defined options to control reading, 

1126 typically used to efficiently read only a subset of the dataset. 

1127 collections : Any, optional 

1128 Collections to be searched, overriding ``self.collections``. 

1129 Can be any of the types supported by the ``collections`` argument 

1130 to butler construction. 

1131 **kwargs 

1132 Additional keyword arguments used to augment or construct a 

1133 `DataCoordinate`. See `DataCoordinate.standardize` 

1134 parameters. 

1135 

1136 Returns 

1137 ------- 

1138 obj : `object` 

1139 The dataset. 

1140 

1141 Raises 

1142 ------ 

1143 ValueError 

1144 Raised if a resolved `DatasetRef` was passed as an input, but it 

1145 differs from the one found in the registry. 

1146 LookupError 

1147 Raised if no matching dataset exists in the `Registry`. 

1148 TypeError 

1149 Raised if no collections were provided. 

1150 

1151 Notes 

1152 ----- 

1153 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1154 this method requires that the given data ID include temporal dimensions 

1155 beyond the dimensions of the dataset type itself, in order to find the 

1156 dataset with the appropriate validity range. For example, a "bias" 

1157 dataset with native dimensions ``{instrument, detector}`` could be 

1158 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1159 ``exposure`` is a temporal dimension. 
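
        Examples
        --------
        Two illustrative sketches (dataset types, dimensions, and values
        are examples only).  A plain read::

            calexp = butler.get("calexp", instrument="HSC", visit=123, detector=42)

        A calibration lookup, where the extra temporal ``exposure``
        dimension selects the matching validity range::

            bias = butler.get("bias", instrument="HSC", exposure=123, detector=42)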

1160 """ 

1161 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1162 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1163 return self.getDirect(ref, parameters=parameters) 

1164 

1165 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1166 dataId: Optional[DataId] = None, *, 

1167 predict: bool = False, 

1168 collections: Any = None, 

1169 run: Optional[str] = None, 

1170 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1171 """Returns the URIs associated with the dataset. 

1172 

1173 Parameters 

1174 ---------- 

1175 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1176 When `DatasetRef` the `dataId` should be `None`. 

1177 Otherwise the `DatasetType` or name thereof. 

1178 dataId : `dict` or `DataCoordinate` 

1179 A `dict` of `Dimension` link name, value pairs that label the 

1180 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1181 should be provided as the first argument. 

1182 predict : `bool` 

1183 If `True`, allow URIs to be returned of datasets that have not 

1184 been written. 

1185 collections : Any, optional 

1186 Collections to be searched, overriding ``self.collections``. 

1187 Can be any of the types supported by the ``collections`` argument 

1188 to butler construction. 

1189 run : `str`, optional 

1190 Run to use for predictions, overriding ``self.run``. 

1191 **kwargs 

1192 Additional keyword arguments used to augment or construct a 

1193 `DataCoordinate`. See `DataCoordinate.standardize` 

1194 parameters. 

1195 

1196 Returns 

1197 ------- 

1198 primary : `ButlerURI` 

1199 The URI to the primary artifact associated with this dataset. 

1200 If the dataset was disassembled within the datastore this 

1201 may be `None`. 

1202 components : `dict` 

1203 URIs to any components associated with the dataset artifact. 

1204 Can be empty if there are no components. 
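
        Examples
        --------
        A sketch for a dataset that the datastore may have disassembled
        into per-component artifacts (names and values are illustrative)::

            primary, components = butler.getURIs("calexp", instrument="HSC",
                                                 visit=123, detector=42)
            if primary is not None:
                print(primary)
            for component, uri in components.items():
                print(component, uri)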

1205 """ 

1206 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

1207 collections=collections, **kwargs) 

1208 if ref.id is None: # only possible if predict is True 

1209 if run is None: 

1210 run = self.run 

1211 if run is None: 

1212 raise TypeError("Cannot predict location with run=None.") 

1213 # Lie about ID, because we can't guess it, and only 

1214 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1215 ref = ref.resolved(id=0, run=run) 

1216 return self.datastore.getURIs(ref, predict) 

1217 

1218 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1219 dataId: Optional[DataId] = None, *, 

1220 predict: bool = False, 

1221 collections: Any = None, 

1222 run: Optional[str] = None, 

1223 **kwargs: Any) -> ButlerURI: 

1224 """Return the URI to the Dataset. 

1225 

1226 Parameters 

1227 ---------- 

1228 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1229 When `DatasetRef` the `dataId` should be `None`. 

1230 Otherwise the `DatasetType` or name thereof. 

1231 dataId : `dict` or `DataCoordinate` 

1232 A `dict` of `Dimension` link name, value pairs that label the 

1233 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1234 should be provided as the first argument. 

1235 predict : `bool` 

1236 If `True`, allow URIs to be returned of datasets that have not 

1237 been written. 

1238 collections : Any, optional 

1239 Collections to be searched, overriding ``self.collections``. 

1240 Can be any of the types supported by the ``collections`` argument 

1241 to butler construction. 

1242 run : `str`, optional 

1243 Run to use for predictions, overriding ``self.run``. 

1244 **kwargs 

1245 Additional keyword arguments used to augment or construct a 

1246 `DataCoordinate`. See `DataCoordinate.standardize` 

1247 parameters. 

1248 

1249 Returns 

1250 ------- 

1251 uri : `ButlerURI` 

1252 URI pointing to the Dataset within the datastore. If the 

1253 Dataset does not exist in the datastore, and if ``predict`` is 

1254 `True`, the URI will be a prediction and will include a URI 

1255 fragment "#predicted". 

1256 If the datastore does not have entities that relate well 

1257 to the concept of a URI the returned URI string will be 

1258 descriptive. The returned URI is not guaranteed to be obtainable. 

1259 

1260 Raises 

1261 ------ 

1262 LookupError 

1263 A URI has been requested for a dataset that does not exist and 

1264 guessing is not allowed. 

1265 ValueError 

1266 Raised if a resolved `DatasetRef` was passed as an input, but it 

1267 differs from the one found in the registry. 

1268 TypeError 

1269 Raised if no collections were provided. 

1270 RuntimeError 

1271 Raised if a URI is requested for a dataset that consists of 

1272 multiple artifacts. 

1273 """ 

1274 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1275 collections=collections, run=run, **kwargs) 

1276 

1277 if primary is None or components: 

1278 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1279 "Use Butler.getURIs() instead.") 

1280 return primary 

1281 

1282 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1283 destination: Union[str, ButlerURI], transfer: str = "auto", 

1284 preserve_path: bool = True, 

1285 overwrite: bool = False) -> List[ButlerURI]: 

1286 """Retrieve the artifacts associated with the supplied refs. 

1287 

1288 Parameters 

1289 ---------- 

1290 refs : iterable of `DatasetRef` 

1291 The datasets for which artifacts are to be retrieved. 

1292 A single ref can result in multiple artifacts. The refs must 

1293 be resolved. 

1294 destination : `ButlerURI` or `str` 

1295 Location to write the artifacts. 

1296 transfer : `str`, optional 

1297 Method to use to transfer the artifacts. Must be one of the options 

1298 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1299 preserve_path : `bool`, optional 

1300 If `True` the full path of the artifact within the datastore 

1301 is preserved. If `False` the final file component of the path 

1302 is used. 

1303 overwrite : `bool`, optional 

1304 If `True` allow transfers to overwrite existing files at the 

1305 destination. 

1306 

1307 Returns 

1308 ------- 

1309 targets : `list` of `ButlerURI` 

1310 URIs of file artifacts in destination location. Order is not 

1311 preserved. 

1312 

1313 Notes 

1314 ----- 

1315 For non-file datastores the artifacts written to the destination 

1316 may not match the representation inside the datastore. For example 

1317 a hierarchical data structure in a NoSQL database may well be stored 

1318 as a JSON file. 
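
        Examples
        --------
        A sketch that copies the artifacts behind a registry query into a
        local directory (the query, collection, and destination are
        illustrative)::

            refs = butler.registry.queryDatasets(
                "raw", collections="HSC/raw/all",
                dataId={"instrument": "HSC", "exposure": 903334})
            paths = butler.retrieveArtifacts(refs, "/tmp/export", transfer="copy")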

1319 """ 

1320 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer, 

1321 preserve_path=preserve_path, overwrite=overwrite) 

1322 

1323 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1324 dataId: Optional[DataId] = None, *, 

1325 collections: Any = None, 

1326 **kwargs: Any) -> bool: 

1327 """Return True if the Dataset is actually present in the Datastore. 

1328 

1329 Parameters 

1330 ---------- 

1331 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1332 When `DatasetRef` the `dataId` should be `None`. 

1333 Otherwise the `DatasetType` or name thereof. 

1334 dataId : `dict` or `DataCoordinate` 

1335 A `dict` of `Dimension` link name, value pairs that label the 

1336 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1337 should be provided as the first argument. 

1338 collections : Any, optional 

1339 Collections to be searched, overriding ``self.collections``. 

1340 Can be any of the types supported by the ``collections`` argument 

1341 to butler construction. 

1342 **kwargs 

1343 Additional keyword arguments used to augment or construct a 

1344 `DataCoordinate`. See `DataCoordinate.standardize` 

1345 parameters. 

1346 

1347 Raises 

1348 ------ 

1349 LookupError 

1350 Raised if the dataset is not even present in the Registry. 

1351 ValueError 

1352 Raised if a resolved `DatasetRef` was passed as an input, but it 

1353 differs from the one found in the registry. 

1354 TypeError 

1355 Raised if no collections were provided. 
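
        Examples
        --------
        A sketch distinguishing "known to the registry" from "present in
        the datastore" (the dataset type and data ID are illustrative)::

            try:
                stored = butler.datasetExists("calexp", instrument="HSC",
                                              visit=123, detector=42)
            except LookupError:
                stored = False  # not even known to the registry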

1356 """ 

1357 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1358 return self.datastore.exists(ref) 

1359 

1360 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1361 """Remove one or more `~CollectionType.RUN` collections and the 

1362 datasets within them. 

1363 

1364 Parameters 

1365 ---------- 

1366 names : `Iterable` [ `str` ] 

1367 The names of the collections to remove. 

1368 unstore : `bool`, optional 

1369 If `True` (default), delete datasets from all datastores in which 

1370 they are present, and attempt to roll back the registry deletions if 

1371 datastore deletions fail (which may not always be possible). If 

1372 `False`, datastore records for these datasets are still removed, 

1373 but any artifacts (e.g. files) will not be. 

1374 

1375 Raises 

1376 ------ 

1377 TypeError 

1378 Raised if one or more collections are not of type 

1379 `~CollectionType.RUN`. 

1380 """ 

1381 if not self.isWriteable(): 

1382 raise TypeError("Butler is read-only.") 

1383 names = list(names) 

1384 refs: List[DatasetRef] = [] 

1385 for name in names: 

1386 collectionType = self.registry.getCollectionType(name) 

1387 if collectionType is not CollectionType.RUN: 

1388 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1389 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1390 with self.registry.transaction(): 

1391 if unstore: 

1392 self.datastore.trash(refs) 

1393 else: 

1394 self.datastore.forget(refs) 

1395 for name in names: 

1396 self.registry.removeCollection(name) 

1397 if unstore: 

1398 # Point of no return for removing artifacts 

1399 self.datastore.emptyTrash() 

1400 
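# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of removeRuns() on a writeable butler. The run name
# "u/someone/scratch" is a placeholder; with unstore=True (the default) the
# underlying file artifacts are deleted as well. The `writeable` constructor
# argument is assumed to be available on this Butler version.
#
#     butler = Butler("/repo", writeable=True)
#     butler.removeRuns(["u/someone/scratch"], unstore=True)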

1401 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False, 

1402 unlink: Optional[List[str]] = None) -> None: 

1403 """Remove a collection and possibly prune datasets within it. 

1404 

1405 Parameters 

1406 ---------- 

1407 name : `str` 

1408 Name of the collection to remove. If this is a 

1409 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1410 datasets within the collection are not modified unless ``unstore`` 

1411 is `True`. If this is a `~CollectionType.RUN` collection, 

1412 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1413 are fully removed from the data repository. 

1414 purge : `bool`, optional 

1415 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1416 fully removing datasets within them. Requires ``unstore=True`` as 

1417 well, as an added precaution against accidental deletion. Must be 

1418 `False` (default) if the collection is not a ``RUN``. 

1419 unstore : `bool`, optional 

1420 If `True`, remove all datasets in the collection from all 

1421 datastores in which they appear. 

1422 unlink : `list` [ `str` ], optional 

1423 Before removing the given collection, unlink it from these 

1424 parent collections. 

1425 

1426 Raises 

1427 ------ 

1428 TypeError 

1429 Raised if the butler is read-only or arguments are mutually 

1430 inconsistent. 

1431 """ 

1432 # See pruneDatasets comments for more information about the logic here; 

1433 # the cases are almost the same, but here we can rely on Registry to 

1434 # take care of everything but Datastore deletion when we remove the 

1435 # collection. 

1436 if not self.isWriteable(): 

1437 raise TypeError("Butler is read-only.") 

1438 collectionType = self.registry.getCollectionType(name) 

1439 if purge and not unstore: 

1440 raise PurgeWithoutUnstorePruneCollectionsError() 

1441 if collectionType is CollectionType.RUN and not purge: 

1442 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1443 if collectionType is not CollectionType.RUN and purge: 

1444 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1445 

1446 def remove(child: str, parent: str) -> None: 

1447 """Remove a child collection from a parent collection.""" 

1448 # Remove child from parent. 

1449 chain = list(self.registry.getCollectionChain(parent)) 

1450 try: 

1451 chain.remove(child) 

1452 except ValueError as e: 

1453 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1454 self.registry.setCollectionChain(parent, chain) 

1455 

1456 with self.registry.transaction(): 

1457 if unlink: 

1458 for parent in unlink: 

1459 remove(name, parent) 

1460 if unstore: 

1461 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1462 self.datastore.trash(refs) 

1463 self.registry.removeCollection(name) 

1464 

1465 if unstore: 

1466 # Point of no return for removing artifacts 

1467 self.datastore.emptyTrash() 

1468 
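# Editor's illustrative sketch (not part of the original _butler.py source):
# hedged examples of pruneCollection(). For a RUN collection both purge and
# unstore must be True; for a TAGGED or CHAINED collection neither is
# required. All collection names below are placeholders, and the butler is
# assumed to be writeable.
#
#     # Fully remove a RUN collection and the datasets within it.
#     butler.pruneCollection("u/someone/old-run", purge=True, unstore=True)
#
#     # Remove a TAGGED collection, leaving its datasets alone, after
#     # unlinking it from a parent CHAINED collection.
#     butler.pruneCollection("my/tagged", unlink=["my/chain"])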

1469 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1470 disassociate: bool = True, 

1471 unstore: bool = False, 

1472 tags: Iterable[str] = (), 

1473 purge: bool = False, 

1474 run: Optional[str] = None) -> None: 

1475 """Remove one or more datasets from a collection and/or storage. 

1476 

1477 Parameters 

1478 ---------- 

1479 refs : `~collections.abc.Iterable` of `DatasetRef` 

1480 Datasets to prune. These must be "resolved" references (not just 

1481 a `DatasetType` and data ID). 

1482 disassociate : `bool`, optional 

1483 Disassociate pruned datasets from ``tags``, or from all collections 

1484 if ``purge=True``. 

1485 unstore : `bool`, optional 

1486 If `True` (`False` is default) remove these datasets from all 

1487 datastores known to this butler. Note that this will make it 

1488 impossible to retrieve these datasets even via other collections. 

1489 Datasets that are already not stored are ignored by this option. 

1490 tags : `Iterable` [ `str` ], optional 

1491 `~CollectionType.TAGGED` collections to disassociate the datasets 

1492 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1493 `True`. 

1494 purge : `bool`, optional 

1495 If `True` (`False` is default), completely remove the dataset from 

1496 the `Registry`. To prevent accidental deletions, ``purge`` may 

1497 only be `True` if all of the following conditions are met: 

1498 

1499 - All given datasets are in the given run; 

1500 - ``disassociate`` is `True`; 

1501 - ``unstore`` is `True`. 

1502 

1503 This mode may remove provenance information from datasets other 

1504 than those provided, and should be used with extreme care. 

1505 

1506 Raises 

1507 ------ 

1508 TypeError 

1509 Raised if the butler is read-only, if no collection was provided, 

1510 or the conditions for ``purge=True`` were not met. 

1511 """ 

1512 if not self.isWriteable(): 

1513 raise TypeError("Butler is read-only.") 

1514 if purge: 

1515 if not disassociate: 

1516 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1517 if not unstore: 

1518 raise TypeError("Cannot pass purge=True without unstore=True.") 

1519 elif disassociate: 

1520 tags = tuple(tags) 

1521 if not tags: 

1522 raise TypeError("No tags provided but disassociate=True.") 

1523 for tag in tags: 

1524 collectionType = self.registry.getCollectionType(tag) 

1525 if collectionType is not CollectionType.TAGGED: 

1526 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1527 f"of non-TAGGED type {collectionType.name}.") 

1528 # Transform possibly-single-pass iterable into something we can iterate 

1529 # over multiple times. 

1530 refs = list(refs) 

1531 # Pruning a component of a DatasetRef makes no sense since registry 

1532 # doesn't know about components and datastore might not store 

1533 # components in a separate file 

1534 for ref in refs: 

1535 if ref.datasetType.component(): 

1536 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1537 # We don't need an unreliable Datastore transaction for this, because 

1538 # we've been extra careful to ensure that Datastore.trash only involves 

1539 # mutating the Registry (it can _look_ at Datastore-specific things, 

1540 # but shouldn't change them), and hence all operations here are 

1541 # Registry operations. 

1542 with self.registry.transaction(): 

1543 if unstore: 

1544 self.datastore.trash(refs) 

1545 if purge: 

1546 self.registry.removeDatasets(refs) 

1547 elif disassociate: 

1548 assert tags, "Guaranteed by earlier logic in this function." 

1549 for tag in tags: 

1550 self.registry.disassociate(tag, refs) 

1551 # We've exited the Registry transaction, and apparently committed. 

1552 # (if there was an exception, everything rolled back, and it's as if 

1553 # nothing happened - and we never get here). 

1554 # Datastore artifacts are not yet gone, but they're clearly marked 

1555 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1556 # problems we can try again later, and if manual administrative 

1557 # intervention is required, it's pretty clear what that should entail: 

1558 # deleting everything on disk and in private Datastore tables that is 

1559 # in the dataset_location_trash table. 

1560 if unstore: 

1561 # Point of no return for removing artifacts 

1562 self.datastore.emptyTrash() 

1563 
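# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of pruneDatasets(): disassociate specific datasets from a
# TAGGED collection and delete their artifacts. The dataset type "bias" and
# the collection name "my/tagged" are placeholders; the butler is assumed to
# be writeable.
#
#     refs = butler.registry.queryDatasets("bias", collections="my/tagged")
#     butler.pruneDatasets(refs, disassociate=True, tags=["my/tagged"],
#                          unstore=True)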

1564 @transactional 

1565 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1566 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1567 ) -> None: 

1568 """Store and register one or more datasets that already exist on disk. 

1569 

1570 Parameters 

1571 ---------- 

1572 datasets : `FileDataset` 

1573 Each positional argument is a struct containing information about 

1574 a file to be ingested, including its URI (either absolute or 

1575 relative to the datastore root, if applicable), a `DatasetRef`, 

1576 and optionally a formatter class or its fully-qualified string 

1577 name. If a formatter is not provided, the formatter that would be 

1578 used for `put` is assumed. On successful return, all 

1579 `FileDataset.refs` attributes will have their `DatasetRef.id` 

1580 attribute populated and all `FileDataset.formatter` attributes will 

1581 be set to the formatter class used. `FileDataset.path` attributes 

1582 may be modified to put paths in whatever the datastore considers a 

1583 standardized form. 

1584 transfer : `str`, optional 

1585 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1586 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1587 transfer the file. 

1588 run : `str`, optional 

1589 The name of the run ingested datasets should be added to, 

1590 overriding ``self.run``. 

1591 idGenerationMode : `DatasetIdGenEnum`, optional 

1592 Specifies option for generating dataset IDs. By default unique IDs 

1593 are generated for each inserted dataset. 

1594 

1595 Raises 

1596 ------ 

1597 TypeError 

1598 Raised if the butler is read-only or if no run was provided. 

1599 NotImplementedError 

1600 Raised if the `Datastore` does not support the given transfer mode. 

1601 DatasetTypeNotSupportedError 

1602 Raised if one or more files to be ingested have a dataset type that 

1603 is not supported by the `Datastore`. 

1604 FileNotFoundError 

1605 Raised if one of the given files does not exist. 

1606 FileExistsError 

1607 Raised if transfer is not `None` but the (internal) location the 

1608 file would be moved to is already occupied. 

1609 

1610 Notes 

1611 ----- 

1612 This operation is not fully exception safe: if a database operation 

1613 fails, the given `FileDataset` instances may be only partially updated. 

1614 

1615 It is atomic in terms of database operations (they will either all 

1616 succeed or all fail) provided the database engine implements 

1617 transactions correctly. It will attempt to be atomic in terms of 

1618 filesystem operations as well, but this cannot be implemented 

1619 rigorously for most datastores. 

1620 """ 

1621 if not self.isWriteable(): 

1622 raise TypeError("Butler is read-only.") 

1623 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1624 # Reorganize the inputs so they're grouped by DatasetType and then 

1625 # data ID. We also include a list of DatasetRefs for each FileDataset 

1626 # to hold the resolved DatasetRefs returned by the Registry, before 

1627 # it's safe to swap them into FileDataset.refs. 

1628 # Some type annotation aliases to make that clearer: 

1629 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1630 GroupedData = MutableMapping[DatasetType, GroupForType] 

1631 # The actual data structure: 

1632 groupedData: GroupedData = defaultdict(dict) 

1633 # And the nested loop that populates it: 

1634 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1635 # This list intentionally shared across the inner loop, since it's 

1636 # associated with `dataset`. 

1637 resolvedRefs: List[DatasetRef] = [] 

1638 

1639 # Somewhere to store pre-existing refs if we have an 

1640 # execution butler. 

1641 existingRefs: List[DatasetRef] = [] 

1642 

1643 for ref in dataset.refs: 

1644 if ref.dataId in groupedData[ref.datasetType]: 

1645 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same" 

1646 " DataId as other ingest dataset" 

1647 f" {groupedData[ref.datasetType][ref.dataId][0].path}" 

1648 f" ({ref.dataId})") 

1649 if self._allow_put_of_predefined_dataset: 

1650 existing_ref = self.registry.findDataset(ref.datasetType, 

1651 dataId=ref.dataId, 

1652 collections=run) 

1653 if existing_ref: 

1654 if self.datastore.knows(existing_ref): 

1655 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}" 

1656 f" already exists as {existing_ref}.") 

1657 # Store this ref elsewhere since it already exists 

1658 # and we do not want to remake it but we do want 

1659 # to store it in the datastore. 

1660 existingRefs.append(existing_ref) 

1661 

1662 # Nothing else to do until we have finished 

1663 # iterating. 

1664 continue 

1665 

1666 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1667 

1668 if existingRefs: 

1669 

1670 if len(dataset.refs) != len(existingRefs): 

1671 # Keeping track of partially pre-existing datasets is hard 

1672 # and should generally never happen. For now don't allow 

1673 # it. 

1674 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist" 

1675 " in registry but others do not. This is not supported.") 

1676 

1677 # Attach the resolved refs if we found them. 

1678 dataset.refs = existingRefs 

1679 

1680 # Now we can bulk-insert into Registry for each DatasetType. 

1681 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1682 desc="Bulk-inserting datasets by type"): 

1683 refs = self.registry.insertDatasets( 

1684 datasetType, 

1685 dataIds=groupForType.keys(), 

1686 run=run, 

1687 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1688 idGenerationMode=idGenerationMode, 

1689 ) 

1690 # Append those resolved DatasetRefs to the new lists we set up for 

1691 # them. 

1692 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1693 resolvedRefs.append(ref) 

1694 

1695 # Go back to the original FileDatasets to replace their refs with the 

1696 # new resolved ones. 

1697 for groupForType in progress.iter_chunks(groupedData.values(), 

1698 desc="Reassociating resolved dataset refs with files"): 

1699 for dataset, resolvedRefs in groupForType.values(): 

1700 dataset.refs = resolvedRefs 

1701 

1702 # Bulk-insert everything into Datastore. 

1703 self.datastore.ingest(*datasets, transfer=transfer) 

1704 
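# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of ingest(): register and store a file that already
# exists on disk. The dataset type "raw_example" (assumed registered with
# instrument/exposure/detector dimensions), the data ID values, the file
# path and the run name are all placeholders; the butler must be writeable.
#
#     from lsst.daf.butler import DatasetRef, FileDataset
#
#     datasetType = butler.registry.getDatasetType("raw_example")
#     ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334,
#                                    "detector": 10})
#     butler.ingest(FileDataset(path="/data/raw_903334_10.fits", refs=[ref]),
#                   transfer="copy", run="HSC/raw/example")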

1705 @contextlib.contextmanager 

1706 def export(self, *, directory: Optional[str] = None, 

1707 filename: Optional[str] = None, 

1708 format: Optional[str] = None, 

1709 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1710 """Export datasets from the repository represented by this `Butler`. 

1711 

1712 This method is a context manager that returns a helper object 

1713 (`RepoExportContext`) that is used to indicate what information from 

1714 the repository should be exported. 

1715 

1716 Parameters 

1717 ---------- 

1718 directory : `str`, optional 

1719 Directory dataset files should be written to if ``transfer`` is not 

1720 `None`. 

1721 filename : `str`, optional 

1722 Name for the file that will include database information associated 

1723 with the exported datasets. If this is not an absolute path and 

1724 ``directory`` is not `None`, it will be written to ``directory`` 

1725 instead of the current working directory. Defaults to 

1726 "export.{format}". 

1727 format : `str`, optional 

1728 File format for the database information file. If `None`, the 

1729 extension of ``filename`` will be used. 

1730 transfer : `str`, optional 

1731 Transfer mode passed to `Datastore.export`. 

1732 

1733 Raises 

1734 ------ 

1735 TypeError 

1736 Raised if the set of arguments passed is inconsistent. 

1737 

1738 Examples 

1739 -------- 

1740 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1741 methods are used to provide the iterables over data IDs and/or datasets 

1742 to be exported:: 

1743 

1744 with butler.export(filename="exports.yaml") as export: 

1745 # Export all flats, but none of the dimension element rows 

1746 # (i.e. data ID information) associated with them. 

1747 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1748 elements=()) 

1749 # Export all datasets that start with "deepCoadd_" and all of 

1750 # their associated data ID information. 

1751 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1752 """ 

1753 if directory is None and transfer is not None: 

1754 raise TypeError("Cannot transfer without providing a directory.") 

1755 if transfer == "move": 

1756 raise TypeError("Transfer may not be 'move': export is read-only") 

1757 if format is None: 

1758 if filename is None: 

1759 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1760 else: 

1761 _, format = os.path.splitext(filename) 

1762 elif filename is None: 

1763 filename = f"export.{format}" 

1764 if directory is not None: 

1765 filename = os.path.join(directory, filename) 

1766 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

1767 with open(filename, 'w') as stream: 

1768 backend = BackendClass(stream) 

1769 try: 

1770 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1771 directory=directory, transfer=transfer) 

1772 yield helper 

1773 except BaseException: 

1774 raise  # re-raise so the else clause (and _finish) is skipped on error 

1775 else: 

1776 helper._finish() 

1777 
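# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of export() writing a YAML description plus copies of the
# file artifacts. The export directory, collection and dataset type names
# are placeholders.
#
#     with butler.export(directory="/tmp/export", filename="export.yaml",
#                        transfer="copy") as export:
#         export.saveDatasets(butler.registry.queryDatasets(
#             "calexp", collections="HSC/runs/example"))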

1778 def import_(self, *, directory: Optional[str] = None, 

1779 filename: Union[str, TextIO, None] = None, 

1780 format: Optional[str] = None, 

1781 transfer: Optional[str] = None, 

1782 skip_dimensions: Optional[Set] = None, 

1783 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1784 reuseIds: bool = False) -> None: 

1785 """Import datasets into this repository that were exported from a 

1786 different butler repository via `~lsst.daf.butler.Butler.export`. 

1787 

1788 Parameters 

1789 ---------- 

1790 directory : `str`, optional 

1791 Directory containing dataset files to import from. If `None`, 

1792 ``filename`` and all dataset file paths specified therein must 

1793 be absolute. 

1794 filename : `str` or `TextIO`, optional 

1795 A stream or name of file that contains database information 

1796 associated with the exported datasets, typically generated by 

1797 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1798 is not an absolute path, does not exist in the current working 

1799 directory, and ``directory`` is not `None`, it is assumed to be in 

1800 ``directory``. Defaults to "export.{format}". 

1801 format : `str`, optional 

1802 File format for ``filename``. If `None`, the extension of 

1803 ``filename`` will be used. 

1804 transfer : `str`, optional 

1805 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1806 skip_dimensions : `set`, optional 

1807 Names of dimensions that should be skipped and not imported. 

1808 idGenerationMode : `DatasetIdGenEnum`, optional 

1809 Specifies option for generating dataset IDs when IDs are not 

1810 provided or their type does not match backend type. By default 

1811 unique IDs are generated for each inserted dataset. 

1812 reuseIds : `bool`, optional 

1813 If `True` then forces re-use of imported dataset IDs for integer 

1814 IDs which are normally generated as auto-incremented; an exception 

1815 will be raised if imported IDs clash with existing ones. This 

1816 option has no effect on the use of globally-unique IDs which are 

1817 always re-used (or generated if integer IDs are being imported). 

1818 

1819 Raises 

1820 ------ 

1821 TypeError 

1822 Raised if the set of arguments passed is inconsistent, or if the 

1823 butler is read-only. 

1824 """ 

1825 if not self.isWriteable(): 

1826 raise TypeError("Butler is read-only.") 

1827 if format is None: 

1828 if filename is None: 

1829 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1830 else: 

1831 _, format = os.path.splitext(filename) # type: ignore 

1832 elif filename is None: 

1833 filename = f"export.{format}" 

1834 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1835 filename = os.path.join(directory, filename) 

1836 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

1837 

1838 def doImport(importStream: TextIO) -> None: 

1839 backend = BackendClass(importStream, self.registry) 

1840 backend.register() 

1841 with self.transaction(): 

1842 backend.load(self.datastore, directory=directory, transfer=transfer, 

1843 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1844 reuseIds=reuseIds) 

1845 

1846 if isinstance(filename, str): 

1847 with open(filename, "r") as stream: 

1848 doImport(stream) 

1849 else: 

1850 doImport(filename) 

1851 
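# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of import_() reading the export produced in the sketch
# above into a different, writeable repository; both paths are placeholders.
#
#     other = Butler("/other_repo", writeable=True)
#     other.import_(directory="/tmp/export", filename="export.yaml",
#                   transfer="copy")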

1852 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef], 

1853 transfer: str = "auto", 

1854 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

1855 skip_missing: bool = True, 

1856 register_dataset_types: bool = False) -> List[DatasetRef]: 

1857 """Transfer datasets to this Butler from a run in another Butler. 

1858 

1859 Parameters 

1860 ---------- 

1861 source_butler : `Butler` 

1862 Butler from which the datasets are to be transferred. 

1863 source_refs : iterable of `DatasetRef` 

1864 Datasets defined in the source butler that should be transferred to 

1865 this butler. 

1866 transfer : `str`, optional 

1867 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

1868 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

1869 A mapping of dataset type to ID generation mode. Only used if 

1870 the source butler is using integer IDs. Should not be used 

1871 if this receiving butler uses integer IDs. Without this mapping, 

1872 dataset import always uses `DatasetIdGenEnum.UNIQUE`. 

1873 skip_missing : `bool`, optional 

1874 If `True`, datasets with no datastore artifact associated with 

1875 them are not transferred. If `False` a registry entry will be 

1876 created even if no datastore record is created (and so will 

1877 look equivalent to the dataset being unstored). 

1878 register_dataset_types : `bool`, optional 

1879 If `True` any missing dataset types are registered. Otherwise 

1880 an exception is raised. 

1881 

1882 Returns 

1883 ------- 

1884 refs : `list` of `DatasetRef` 

1885 The refs added to this Butler. 

1886 

1887 Notes 

1888 ----- 

1889 Requires that any dimension definitions are already present in the 

1890 receiving Butler. The datastore artifact has to exist for a transfer 

1891 to be made but non-existence is not an error. 

1892 

1893 Datasets that already exist in this run will be skipped. 

1894 

1895 The datasets are imported as part of a transaction, although 

1896 dataset types are registered before the transaction is started. 

1897 This means that it is possible for a dataset type to be registered 

1898 even though transfer has failed. 

1899 """ 

1900 if not self.isWriteable(): 

1901 raise TypeError("Butler is read-only.") 

1902 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1903 

1904 # Will iterate through the refs multiple times so need to convert 

1905 # to a list if this isn't a collection. 

1906 if not isinstance(source_refs, collections.abc.Collection): 

1907 source_refs = list(source_refs) 

1908 

1909 original_count = len(source_refs) 

1910 log.info("Transferring %d datasets into %s", original_count, str(self)) 

1911 

1912 if id_gen_map is None: 

1913 id_gen_map = {} 

1914 

1915 # In some situations the datastore artifact may be missing 

1916 # and we do not want that registry entry to be imported. 

1917 # Asking datastore is not sufficient, the records may have been 

1918 # purged, we have to ask for the (predicted) URI and check 

1919 # existence explicitly. Execution butler is set up exactly like 

1920 # this with no datastore records. 

1921 artifact_existence: Dict[ButlerURI, bool] = {} 

1922 if skip_missing: 

1923 dataset_existence = source_butler.datastore.mexists(source_refs, 

1924 artifact_existence=artifact_existence) 

1925 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1926 filtered_count = len(source_refs) 

1927 log.verbose("%d datasets removed because the artifact does not exist. Now have %d.", 

1928 original_count - filtered_count, filtered_count) 

1929 

1930 # Importing requires that we group the refs by dataset type and run 

1931 # before doing the import. 

1932 source_dataset_types = set() 

1933 grouped_refs = defaultdict(list) 

1934 grouped_indices = defaultdict(list) 

1935 for i, ref in enumerate(source_refs): 

1936 grouped_refs[ref.datasetType, ref.run].append(ref) 

1937 grouped_indices[ref.datasetType, ref.run].append(i) 

1938 source_dataset_types.add(ref.datasetType) 

1939 

1940 # Check to see if the dataset type in the source butler has 

1941 # the same definition in the target butler and register missing 

1942 # ones if requested. Registration must happen outside a transaction. 

1943 newly_registered_dataset_types = set() 

1944 for datasetType in source_dataset_types: 

1945 if register_dataset_types: 

1946 # Let this raise immediately if inconsistent. Continuing 

1947 # on to find additional inconsistent dataset types 

1948 # might result in additional unwanted dataset types being 

1949 # registered. 

1950 if self.registry.registerDatasetType(datasetType): 

1951 newly_registered_dataset_types.add(datasetType) 

1952 else: 

1953 # If the dataset type is missing, let it fail immediately. 

1954 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

1955 if target_dataset_type != datasetType: 

1956 raise ConflictingDefinitionError("Source butler dataset type differs from definition" 

1957 f" in target butler: {datasetType} !=" 

1958 f" {target_dataset_type}") 

1959 if newly_registered_dataset_types: 

1960 # We may have registered some even if there were inconsistencies 

1961 # but should let people know (or else remove them again). 

1962 log.log(VERBOSE, "Registered the following dataset types in the target Butler: %s", 

1963 ", ".join(d.name for d in newly_registered_dataset_types)) 

1964 else: 

1965 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

1966 

1967 # The returned refs should be identical for UUIDs. 

1968 # For now must also support integers and so need to retain the 

1969 # newly-created refs from this registry. 

1970 # Pre-size it so we can assign refs into the correct slots 

1971 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

1972 default_id_gen = DatasetIdGenEnum.UNIQUE 

1973 

1974 handled_collections: Set[str] = set() 

1975 

1976 # Do all the importing in a single transaction. 

1977 with self.transaction(): 

1978 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(), 

1979 desc="Importing to registry" 

1980 " by run and dataset type"): 

1981 if run not in handled_collections: 

1982 run_doc = source_butler.registry.getCollectionDocumentation(run) 

1983 registered = self.registry.registerRun(run, doc=run_doc) 

1984 handled_collections.add(run) 

1985 if registered: 

1986 log.log(VERBOSE, "Creating output run %s", run) 

1987 

1988 id_generation_mode = default_id_gen 

1989 if isinstance(refs_to_import[0].id, int): 

1990 # ID generation mode might need to be overridden when 

1991 # targeting UUIDs 

1992 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

1993 

1994 n_refs = len(refs_to_import) 

1995 log.verbose("Importing %d ref%s of dataset type %s into run %s", 

1996 n_refs, "" if n_refs == 1 else "s", datasetType.name, run) 

1997 

1998 # No way to know if this butler's registry uses UUID. 

1999 # We have to trust the caller on this. If it fails they will 

2000 # have to change their approach. We can't catch the exception 

2001 # and retry with unique because that will mess up the 

2002 # transaction handling. We aren't allowed to ask the registry 

2003 # manager what type of ID it is using. 

2004 imported_refs = self.registry._importDatasets(refs_to_import, 

2005 idGenerationMode=id_generation_mode, 

2006 expand=False) 

2007 

2008 # Map them into the correct slots to match the initial order 

2009 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2010 transferred_refs_tmp[i] = ref 

2011 

2012 # Mypy insists that we might have None in here so we have to make 

2013 # that explicit by assigning to a new variable and filtering out 

2014 # something that won't be there. 

2015 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2016 

2017 # Check consistency 

2018 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2019 

2020 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2021 

2022 # The transferred refs need to be reordered to match the original 

2023 # ordering given by the caller. Without this the datastore transfer 

2024 # will be broken. 

2025 

2026 # Ask the datastore to transfer. The datastore has to check that 

2027 # the source datastore is compatible with the target datastore. 

2028 self.datastore.transfer_from(source_butler.datastore, source_refs, 

2029 local_refs=transferred_refs, transfer=transfer, 

2030 artifact_existence=artifact_existence) 

2031 

2032 return transferred_refs 

2033 
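# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of transfer_from(): copy selected datasets from a source
# repository into this (writeable) one, registering any missing dataset
# types. Repository paths, the collection and the dataset type are
# placeholders.
#
#     source = Butler("/source_repo")
#     refs = source.registry.queryDatasets("calexp",
#                                          collections="HSC/runs/example")
#     transferred = butler.transfer_from(source, refs, transfer="copy",
#                                        register_dataset_types=True)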

2034 def validateConfiguration(self, logFailures: bool = False, 

2035 datasetTypeNames: Optional[Iterable[str]] = None, 

2036 ignore: Optional[Iterable[str]] = None) -> None: 

2037 """Validate butler configuration. 

2038 

2039 Checks that each `DatasetType` can be stored in the `Datastore`. 

2040 

2041 Parameters 

2042 ---------- 

2043 logFailures : `bool`, optional 

2044 If `True`, output a log message for every validation error 

2045 detected. 

2046 datasetTypeNames : iterable of `str`, optional 

2047 The `DatasetType` names that should be checked. This allows 

2048 only a subset to be selected. 

2049 ignore : iterable of `str`, optional 

2050 Names of DatasetTypes to skip over. This can be used to skip 

2051 known problems. If a named `DatasetType` corresponds to a 

2052 composite, all components of that `DatasetType` will also be 

2053 ignored. 

2054 

2055 Raises 

2056 ------ 

2057 ButlerValidationError 

2058 Raised if there is some inconsistency with how this Butler 

2059 is configured. 

2060 """ 

2061 if datasetTypeNames: 

2062 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2063 else: 

2064 datasetTypes = list(self.registry.queryDatasetTypes()) 

2065 

2066 # filter out anything from the ignore list 

2067 if ignore: 

2068 ignore = set(ignore) 

2069 datasetTypes = [e for e in datasetTypes 

2070 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

2071 else: 

2072 ignore = set() 

2073 

2074 # Find all the registered instruments 

2075 instruments = set( 

2076 record.name for record in self.registry.queryDimensionRecords("instrument") 

2077 ) 

2078 

2079 # For each datasetType that has an instrument dimension, create 

2080 # a DatasetRef for each defined instrument 

2081 datasetRefs = [] 

2082 

2083 for datasetType in datasetTypes: 

2084 if "instrument" in datasetType.dimensions: 

2085 for instrument in instruments: 

2086 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

2087 conform=False) 

2088 datasetRefs.append(datasetRef) 

2089 

2090 entities: List[Union[DatasetType, DatasetRef]] = [] 

2091 entities.extend(datasetTypes) 

2092 entities.extend(datasetRefs) 

2093 

2094 datastoreErrorStr = None 

2095 try: 

2096 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2097 except ValidationError as e: 

2098 datastoreErrorStr = str(e) 

2099 

2100 # Also check that the LookupKeys used by the datastores match 

2101 # registry and storage class definitions 

2102 keys = self.datastore.getLookupKeys() 

2103 

2104 failedNames = set() 

2105 failedDataId = set() 

2106 for key in keys: 

2107 if key.name is not None: 

2108 if key.name in ignore: 

2109 continue 

2110 

2111 # skip if specific datasetType names were requested and this 

2112 # name does not match 

2113 if datasetTypeNames and key.name not in datasetTypeNames: 

2114 continue 

2115 

2116 # See if it is a StorageClass or a DatasetType 

2117 if key.name in self.storageClasses: 

2118 pass 

2119 else: 

2120 try: 

2121 self.registry.getDatasetType(key.name) 

2122 except KeyError: 

2123 if logFailures: 

2124 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2125 failedNames.add(key) 

2126 else: 

2127 # Dimensions are checked for consistency when the Butler 

2128 # is created and rendezvoused with a universe. 

2129 pass 

2130 

2131 # Check that the instrument is a valid instrument 

2132 # Currently only support instrument so check for that 

2133 if key.dataId: 

2134 dataIdKeys = set(key.dataId) 

2135 if set(["instrument"]) != dataIdKeys: 

2136 if logFailures: 

2137 log.critical("Key '%s' has unsupported DataId override", key) 

2138 failedDataId.add(key) 

2139 elif key.dataId["instrument"] not in instruments: 

2140 if logFailures: 

2141 log.critical("Key '%s' has unknown instrument", key) 

2142 failedDataId.add(key) 

2143 

2144 messages = [] 

2145 

2146 if datastoreErrorStr: 

2147 messages.append(datastoreErrorStr) 

2148 

2149 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2150 (failedDataId, "Keys with bad DataId entries: ")): 

2151 if failed: 

2152 msg += ", ".join(str(k) for k in failed) 

2153 messages.append(msg) 

2154 

2155 if messages: 

2156 raise ButlerValidationError(";\n".join(messages)) 

2157 
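# Editor's illustrative sketch (not part of the original _butler.py source):
# a hedged example of validateConfiguration(): log every problem found and
# skip a known-problematic dataset type ("raw" here is a placeholder). A
# validation error is raised if any inconsistency is detected.
#
#     butler.validateConfiguration(logFailures=True, ignore=["raw"])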

2158 @property 

2159 def collections(self) -> CollectionSearch: 

2160 """The collections to search by default, in order (`CollectionSearch`). 

2161 

2162 This is an alias for ``self.registry.defaults.collections``. It cannot 

2163 be set directly in isolation, but all defaults may be changed together 

2164 by assigning a new `RegistryDefaults` instance to 

2165 ``self.registry.defaults``. 

2166 """ 

2167 return self.registry.defaults.collections 

2168 

2169 @property 

2170 def run(self) -> Optional[str]: 

2171 """Name of the run this butler writes outputs to by default (`str` or 

2172 `None`). 

2173 

2174 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2175 directly in isolation, but all defaults may be changed together by 

2176 assigning a new `RegistryDefaults` instance to 

2177 ``self.registry.defaults``. 

2178 """ 

2179 return self.registry.defaults.run 

2180 

2181 registry: Registry 

2182 """The object that manages dataset metadata and relationships (`Registry`). 

2183 

2184 Most operations that don't involve reading or writing butler datasets are 

2185 accessible only via `Registry` methods. 

2186 """ 

2187 

2188 datastore: Datastore 

2189 """The object that manages actual dataset storage (`Datastore`). 

2190 

2191 Direct user access to the datastore should rarely be necessary; the primary 

2192 exception is the case where a `Datastore` implementation provides extra 

2193 functionality beyond what the base class defines. 

2194 """ 

2195 

2196 storageClasses: StorageClassFactory 

2197 """An object that maps known storage class names to objects that fully 

2198 describe them (`StorageClassFactory`). 

2199 """ 

2200 

2201 _allow_put_of_predefined_dataset: bool 

2202 """Allow a put to succeed even if there is already a registry entry for it 

2203 but not a datastore record. (`bool`)."""