Coverage for python/lsst/daf/butler/_butler.py: 10%


614 statements  

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37import contextlib 

38import logging 

39import numbers 

40import os 

41from collections import defaultdict 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from lsst.resources import ResourcePath, ResourcePathExpression 

60from lsst.utils import doImportType 

61from lsst.utils.introspection import get_class_of 

62from lsst.utils.logging import VERBOSE, getLogger 

63 

64from ._butlerConfig import ButlerConfig 

65from ._butlerRepoIndex import ButlerRepoIndex 

66from ._deferredDatasetHandle import DeferredDatasetHandle 

67from .core import ( 

68 AmbiguousDatasetError, 

69 Config, 

70 ConfigSubset, 

71 DataCoordinate, 

72 DataId, 

73 DataIdValue, 

74 DatasetRef, 

75 DatasetType, 

76 Datastore, 

77 Dimension, 

78 DimensionConfig, 

79 FileDataset, 

80 Progress, 

81 StorageClassFactory, 

82 Timespan, 

83 ValidationError, 

84) 

85from .core.repoRelocation import BUTLER_ROOT_TAG 

86from .core.utils import transactional 

87from .registry import ( 

88 CollectionSearch, 

89 CollectionType, 

90 ConflictingDefinitionError, 

91 DatasetIdGenEnum, 

92 Registry, 

93 RegistryConfig, 

94 RegistryDefaults, 

95) 

96from .transfers import RepoExportContext 

97 

98log = getLogger(__name__) 

99 

100 

101class ButlerValidationError(ValidationError): 

102 """There is a problem with the Butler configuration.""" 

103 

104 pass 

105 

106 

107class PruneCollectionsArgsError(TypeError): 

108 """Base class for errors relating to Butler.pruneCollections input 

109 arguments. 

110 """ 

111 

112 pass 

113 

114 

115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

116 """Raised when purge and unstore are both required to be True, and 

117 purge is True but unstore is False. 

118 """ 

119 

120 def __init__(self) -> None: 

121 super().__init__("Cannot pass purge=True without unstore=True.") 

122 

123 

124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

125 """Raised when pruning a RUN collection but purge is False.""" 

126 

127 def __init__(self, collectionType: CollectionType): 

128 self.collectionType = collectionType 

129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

130 

131 

132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

133 """Raised when purge is True but is not supported for the given 

134 collection.""" 

135 

136 def __init__(self, collectionType: CollectionType): 

137 self.collectionType = collectionType 

138 super().__init__( 

139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True." 

140 ) 

141 

142 

143class Butler: 

144 """Main entry point for the data access system. 

145 

146 Parameters 

147 ---------- 

148 config : `ButlerConfig`, `Config` or `str`, optional

149 Configuration. Anything acceptable to the 

150 `ButlerConfig` constructor. If a directory path 

151 is given the configuration will be read from a ``butler.yaml`` file in 

152 that location. If `None` is given default values will be used. 

153 butler : `Butler`, optional

154 If provided, construct a new Butler that uses the same registry and 

155 datastore as the given one, but with the given collection and run. 

156 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

157 arguments. 

158 collections : `str` or `Iterable` [ `str` ], optional 

159 An expression specifying the collections to be searched (in order) when 

160 reading datasets. 

161 This may be a `str` collection name or an iterable thereof. 

162 See :ref:`daf_butler_collection_expressions` for more information. 

163 These collections are not registered automatically and must be 

164 manually registered before they are used by any method, but they may be 

165 manually registered after the `Butler` is initialized. 

166 run : `str`, optional 

167 Name of the `~CollectionType.RUN` collection new datasets should be 

168 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

169 ``collections`` will be set to ``[run]``. If not `None`, this 

170 collection will automatically be registered. If this is not set (and 

171 ``writeable`` is not set either), a read-only butler will be created. 

172 searchPaths : `list` of `str`, optional 

173 Directory paths to search when calculating the full Butler 

174 configuration. Not used if the supplied config is already a 

175 `ButlerConfig`. 

176 writeable : `bool`, optional 

177 Explicitly sets whether the butler supports write operations. If not 

178 provided, a read-write butler is created if ``run`` is not `None`;

179 otherwise a read-only butler is created.

180 inferDefaults : `bool`, optional 

181 If `True` (default) infer default data ID values from the values 

182 present in the datasets in ``collections``: if all collections have the 

183 same value (or no value) for a governor dimension, that value will be 

184 the default for that dimension. Nonexistent collections are ignored. 

185 If a default value is provided explicitly for a governor dimension via 

186 ``**kwargs``, no default will be inferred for that dimension. 

187 **kwargs : `str` 

188 Default data ID key-value pairs. These may only identify "governor" 

189 dimensions like ``instrument`` and ``skymap``. 

190 

191 Examples 

192 -------- 

193 While there are many ways to control exactly how a `Butler` interacts with 

194 the collections in its `Registry`, the most common cases are still simple. 

195 

196 For a read-only `Butler` that searches one collection, do:: 

197 

198 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

199 

200 For a read-write `Butler` that writes to and reads from a 

201 `~CollectionType.RUN` collection:: 

202 

203 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

204 

205 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

206 because we want to write to one `~CollectionType.RUN` collection but read 

207 from several others (as well):: 

208 

209 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

210 collections=["u/alice/DM-50000/a", 

211 "u/bob/DM-49998", 

212 "HSC/defaults"]) 

213 

214 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

215 Datasets will be read first from that run (since it appears first in the 

216 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

217 

218 Finally, one can always create a `Butler` with no collections:: 

219 

220 butler = Butler("/path/to/repo", writeable=True) 

221 

222 This can be extremely useful when you just want to use ``butler.registry``, 

223 e.g. for inserting dimension data or managing collections, or when the 

224 collections you want to use with the butler are not consistent. 

225 Passing ``writeable`` explicitly here is only necessary if you want to be 

226 able to make changes to the repo - usually the value for ``writeable`` can 

227 be guessed from the collection arguments provided, but it defaults to 

228 `False` when there are no collection arguments.

229 """ 

230 

231 def __init__( 

232 self, 

233 config: Union[Config, str, None] = None, 

234 *, 

235 butler: Optional[Butler] = None, 

236 collections: Any = None, 

237 run: Optional[str] = None, 

238 searchPaths: Optional[List[str]] = None, 

239 writeable: Optional[bool] = None, 

240 inferDefaults: bool = True, 

241 **kwargs: str, 

242 ): 

243 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

244 # Load registry, datastore, etc. from config or existing butler. 

245 if butler is not None: 

246 if config is not None or searchPaths is not None or writeable is not None: 

247 raise TypeError( 

248 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

249 ) 

250 self.registry = butler.registry.copy(defaults) 

251 self.datastore = butler.datastore 

252 self.storageClasses = butler.storageClasses 

253 self._config: ButlerConfig = butler._config 

254 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

255 else: 

256 self._config = ButlerConfig(config, searchPaths=searchPaths) 

257 try: 

258 if "root" in self._config: 

259 butlerRoot = self._config["root"] 

260 else: 

261 butlerRoot = self._config.configDir 

262 if writeable is None: 

263 writeable = run is not None 

264 self.registry = Registry.fromConfig( 

265 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

266 ) 

267 self.datastore = Datastore.fromConfig( 

268 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

269 ) 

270 self.storageClasses = StorageClassFactory() 

271 self.storageClasses.addFromConfig(self._config) 

272 self._allow_put_of_predefined_dataset = self._config.get( 

273 "allow_put_of_predefined_dataset", False 

274 ) 

275 except Exception: 

276 # Failures here usually mean that configuration is incomplete, 

277 # just issue an error message that includes the config file URI.

278 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

279 raise 

280 

281 if "run" in self._config or "collection" in self._config: 

282 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

283 
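# A short sketch of the ``butler=`` chaining pattern accepted by the
# constructor above: derive a second Butler that reuses the first one's
# registry and datastore but has different default collections and run.
# The repository path and collection/run names are hypothetical.
from lsst.daf.butler import Butler

base = Butler("/path/to/repo", writeable=True)
derived = Butler(butler=base, collections=["HSC/defaults"], run="u/alice/DM-50000/a")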

284 GENERATION: ClassVar[int] = 3 

285 """This is a Generation 3 Butler. 

286 

287 This attribute may be removed in the future, once the Generation 2 Butler 

288 interface has been fully retired; it should only be used in transitional 

289 code. 

290 """ 

291 

292 @classmethod 

293 def get_repo_uri(cls, label: str) -> ResourcePath: 

294 """Look up the label in a butler repository index. 

295 

296 Parameters 

297 ---------- 

298 label : `str` 

299 Label of the Butler repository to look up. 

300 

301 Returns 

302 ------- 

303 uri : `lsst.resources.ResourcePath` 

304 URI to the Butler repository associated with the given label. 

305 

306 Raises 

307 ------ 

308 KeyError 

309 Raised if the label is not found in the index, or if an index 

310 can not be found at all. 

311 

312 Notes 

313 ----- 

314 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

315 information is discovered. 

316 """ 

317 return ButlerRepoIndex.get_repo_uri(label) 

318 

319 @classmethod 

320 def get_known_repos(cls) -> Set[str]: 

321 """Retrieve the list of known repository labels. 

322 

323 Returns 

324 ------- 

325 repos : `set` of `str` 

326 All the known labels. Can be empty if no index can be found. 

327 

328 Notes 

329 ----- 

330 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

331 information is discovered. 

332 """ 

333 return ButlerRepoIndex.get_known_repos() 

334 
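# A usage sketch for the two repository-index helpers above; the label
# "main" is hypothetical, and a missing label (or missing index) raises
# KeyError as documented.
from lsst.daf.butler import Butler

print(Butler.get_known_repos())  # may be an empty set if no index exists
try:
    uri = Butler.get_repo_uri("main")
except KeyError:
    uri = None  # label not present in the index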

335 @staticmethod 

336 def makeRepo( 

337 root: ResourcePathExpression, 

338 config: Union[Config, str, None] = None, 

339 dimensionConfig: Union[Config, str, None] = None, 

340 standalone: bool = False, 

341 searchPaths: Optional[List[str]] = None, 

342 forceConfigRoot: bool = True, 

343 outfile: Optional[ResourcePathExpression] = None, 

344 overwrite: bool = False, 

345 ) -> Config: 

346 """Create an empty data repository by adding a butler.yaml config 

347 to a repository root directory. 

348 

349 Parameters 

350 ---------- 

351 root : `lsst.resources.ResourcePathExpression` 

352 Path or URI to the root location of the new repository. Will be 

353 created if it does not exist. 

354 config : `Config` or `str`, optional 

355 Configuration to write to the repository, after setting any 

356 root-dependent Registry or Datastore config options. Can not 

357 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

358 configuration will be used. Root-dependent config options 

359 specified in this config are overwritten if ``forceConfigRoot`` 

360 is `True`. 

361 dimensionConfig : `Config` or `str`, optional 

362 Configuration for dimensions, will be used to initialize registry 

363 database. 

364 standalone : `bool` 

365 If True, write all expanded defaults, not just customized or 

366 repository-specific settings. 

367 This (mostly) decouples the repository from the default 

368 configuration, insulating it from changes to the defaults (which 

369 may be good or bad, depending on the nature of the changes). 

370 Future *additions* to the defaults will still be picked up when 

371 initializing `Butlers` to repos created with ``standalone=True``. 

372 searchPaths : `list` of `str`, optional 

373 Directory paths to search when calculating the full butler 

374 configuration. 

375 forceConfigRoot : `bool`, optional 

376 If `False`, any values present in the supplied ``config`` that 

377 would normally be reset are not overridden and will appear 

378 directly in the output config. This allows non-standard overrides 

379 of the root directory for a datastore or registry to be given. 

380 If this parameter is `True` the values for ``root`` will be 

381 forced into the resulting config if appropriate. 

382 outfile : `lsst.resources.ResourcePathExpression`, optional

383 If not-`None`, the output configuration will be written to this 

384 location rather than into the repository itself. Can be a URI 

385 string. Can refer to a directory that will be used to write 

386 ``butler.yaml``. 

387 overwrite : `bool`, optional 

388 Create a new configuration file even if one already exists 

389 in the specified output location. Default is to raise 

390 an exception. 

391 

392 Returns 

393 ------- 

394 config : `Config` 

395 The updated `Config` instance written to the repo. 

396 

397 Raises 

398 ------ 

399 ValueError 

400 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

401 regular Config (as these subclasses would make it impossible to 

402 support ``standalone=False``). 

403 FileExistsError 

404 Raised if the output config file already exists. 

405 os.error 

406 Raised if the directory does not exist, exists but is not a 

407 directory, or cannot be created. 

408 

409 Notes 

410 ----- 

411 Note that when ``standalone=False`` (the default), the configuration 

412 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

413 construct the repository should also be used to construct any Butlers 

414 to avoid configuration inconsistencies. 

415 """ 

416 if isinstance(config, (ButlerConfig, ConfigSubset)): 

417 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

418 

419 # Ensure that the root of the repository exists or can be made 

420 root_uri = ResourcePath(root, forceDirectory=True) 

421 root_uri.mkdir() 

422 

423 config = Config(config) 

424 

425 # If we are creating a new repo from scratch with relative roots, 

426 # do not propagate an explicit root from the config file 

427 if "root" in config: 

428 del config["root"] 

429 

430 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

431 imported_class = doImportType(full["datastore", "cls"]) 

432 if not issubclass(imported_class, Datastore): 

433 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

434 datastoreClass: Type[Datastore] = imported_class 

435 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

436 

437 # if key exists in given config, parse it, otherwise parse the defaults 

438 # in the expanded config 

439 if config.get(("registry", "db")): 

440 registryConfig = RegistryConfig(config) 

441 else: 

442 registryConfig = RegistryConfig(full) 

443 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

444 if defaultDatabaseUri is not None: 

445 Config.updateParameters( 

446 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

447 ) 

448 else: 

449 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

450 

451 if standalone: 

452 config.merge(full) 

453 else: 

454 # Always expand the registry.managers section into the per-repo 

455 # config, because after the database schema is created, it's not 

456 # allowed to change anymore. Note that in the standalone=True 

457 # branch, _everything_ in the config is expanded, so there's no 

458 # need to special case this. 

459 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

460 configURI: ResourcePathExpression 

461 if outfile is not None: 

462 # When writing to a separate location we must include 

463 # the root of the butler repo in the config else it won't know 

464 # where to look. 

465 config["root"] = root_uri.geturl() 

466 configURI = outfile 

467 else: 

468 configURI = root_uri 

469 config.dumpToUri(configURI, overwrite=overwrite) 

470 

471 # Create Registry and populate tables 

472 registryConfig = RegistryConfig(config.get("registry")) 

473 dimensionConfig = DimensionConfig(dimensionConfig) 

474 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

475 

476 log.verbose("Wrote new Butler configuration file to %s", configURI) 

477 

478 return config 

479 
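# A minimal sketch of makeRepo(): create an empty repository with the
# default configuration and then open a writeable Butler against it.
# The path is hypothetical and must be creatable by the caller.
from lsst.daf.butler import Butler

Butler.makeRepo("/path/to/new/repo")
butler = Butler("/path/to/new/repo", writeable=True)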

480 @classmethod 

481 def _unpickle( 

482 cls, 

483 config: ButlerConfig, 

484 collections: Optional[CollectionSearch], 

485 run: Optional[str], 

486 defaultDataId: Dict[str, str], 

487 writeable: bool, 

488 ) -> Butler: 

489 """Callable used to unpickle a Butler. 

490 

491 We prefer not to use ``Butler.__init__`` directly so we can force some 

492 of its many arguments to be keyword-only (note that ``__reduce__`` 

493 can only invoke callables with positional arguments). 

494 

495 Parameters 

496 ---------- 

497 config : `ButlerConfig` 

498 Butler configuration, already coerced into a true `ButlerConfig` 

499 instance (and hence after any search paths for overrides have been 

500 utilized). 

501 collections : `CollectionSearch` 

502 Names of the default collections to read from. 

503 run : `str`, optional 

504 Name of the default `~CollectionType.RUN` collection to write to. 

505 defaultDataId : `dict` [ `str`, `str` ] 

506 Default data ID values. 

507 writeable : `bool` 

508 Whether the Butler should support write operations. 

509 

510 Returns 

511 ------- 

512 butler : `Butler` 

513 A new `Butler` instance. 

514 """ 

515 # MyPy doesn't recognize that the kwargs below are totally valid; it 

516 # seems to think ``**defaultDataId`` is a _positional_ argument!

517 return cls( 

518 config=config, 

519 collections=collections, 

520 run=run, 

521 writeable=writeable, 

522 **defaultDataId, # type: ignore 

523 ) 

524 

525 def __reduce__(self) -> tuple: 

526 """Support pickling.""" 

527 return ( 

528 Butler._unpickle, 

529 ( 

530 self._config, 

531 self.collections, 

532 self.run, 

533 self.registry.defaults.dataId.byName(), 

534 self.registry.isWriteable(), 

535 ), 

536 ) 

537 
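# Because __reduce__ is defined above, a Butler can be pickled (e.g. for
# use in a subprocess); the unpickled copy is reconstructed from the same
# configuration, default collections/run and writeability. The repo path
# and collections here are hypothetical.
import pickle

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["HSC/defaults"])
clone = pickle.loads(pickle.dumps(butler))
assert clone.isWriteable() == butler.isWriteable()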

538 def __str__(self) -> str: 

539 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

540 self.collections, self.run, self.datastore, self.registry 

541 ) 

542 

543 def isWriteable(self) -> bool: 

544 """Return `True` if this `Butler` supports write operations.""" 

545 return self.registry.isWriteable() 

546 

547 @contextlib.contextmanager 

548 def transaction(self) -> Iterator[None]: 

549 """Context manager supporting `Butler` transactions. 

550 

551 Transactions can be nested. 

552 """ 

553 with self.registry.transaction(): 

554 with self.datastore.transaction(): 

555 yield 

556 
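# A sketch of the transaction context manager: if anything inside the
# block raises, the registry and datastore changes made within it are
# rolled back together. The run, dataset type names and data IDs are
# hypothetical, and ``catalog``/``summary`` stand in for in-memory
# datasets with the appropriate storage classes.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
catalog = summary = object()  # placeholders for real datasets
with butler.transaction():
    butler.put(catalog, "src", visit=903334, detector=20, instrument="HSC")
    butler.put(summary, "visitSummary", visit=903334, instrument="HSC")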

557 def _standardizeArgs( 

558 self, 

559 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

560 dataId: Optional[DataId] = None, 

561 **kwargs: Any, 

562 ) -> Tuple[DatasetType, Optional[DataId]]: 

563 """Standardize the arguments passed to several Butler APIs. 

564 

565 Parameters 

566 ---------- 

567 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

568 When `DatasetRef` the `dataId` should be `None`. 

569 Otherwise the `DatasetType` or name thereof. 

570 dataId : `dict` or `DataCoordinate` 

571 A `dict` of `Dimension` link name, value pairs that label the 

572 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

573 should be provided as the second argument. 

574 **kwargs 

575 Additional keyword arguments used to augment or construct a 

576 `DataCoordinate`. See `DataCoordinate.standardize` 

577 parameters. 

578 

579 Returns 

580 ------- 

581 datasetType : `DatasetType` 

582 A `DatasetType` instance extracted from ``datasetRefOrType``. 

583 dataId : `dict` or `DataId`, optional 

584 Argument that can be used (along with ``kwargs``) to construct a 

585 `DataId`. 

586 

587 Notes 

588 ----- 

589 Butler APIs that conceptually need a DatasetRef also allow passing a 

590 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

591 keyword arguments that can be used to construct one) separately. This 

592 method accepts those arguments and always returns a true `DatasetType` 

593 and a `DataId` or `dict`. 

594 

595 Standardization of `dict` vs `DataId` is best handled by passing the 

596 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

597 generally similarly flexible. 

598 """ 

599 externalDatasetType: Optional[DatasetType] = None 

600 internalDatasetType: Optional[DatasetType] = None 

601 if isinstance(datasetRefOrType, DatasetRef): 

602 if dataId is not None or kwargs: 

603 raise ValueError("DatasetRef given, cannot use dataId as well") 

604 externalDatasetType = datasetRefOrType.datasetType 

605 dataId = datasetRefOrType.dataId 

606 else: 

607 # Don't check whether DataId is provided, because Registry APIs 

608 # can usually construct a better error message when it wasn't. 

609 if isinstance(datasetRefOrType, DatasetType): 

610 externalDatasetType = datasetRefOrType 

611 else: 

612 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

613 

614 # Check that they are self-consistent 

615 if externalDatasetType is not None: 

616 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

617 if externalDatasetType != internalDatasetType: 

618 raise ValueError( 

619 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

620 f"registry definition ({internalDatasetType})" 

621 ) 

622 

623 assert internalDatasetType is not None 

624 return internalDatasetType, dataId 

625 

626 def _rewrite_data_id( 

627 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

628 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

629 """Rewrite a data ID taking into account dimension records. 

630 

631 Take a Data ID and keyword args and rewrite it if necessary to 

632 allow the user to specify dimension records rather than dimension 

633 primary key values.

634 

635 This allows a user to include a dataId dict with keys of 

636 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

637 the integer exposure ID. It also allows a string to be given 

638 for a dimension value rather than the integer ID if that is more 

639 convenient. For example, rather than having to specify the

640 detector with ``detector.full_name``, a string given for ``detector`` 

641 will be interpreted as the full name and converted to the integer 

642 value. 

643 

644 Keyword arguments can also use strings for dimensions like detector 

645 and exposure but python does not allow them to include ``.`` and 

646 so the ``exposure.day_obs`` syntax can not be used in a keyword 

647 argument. 

648 

649 Parameters 

650 ---------- 

651 dataId : `dict` or `DataCoordinate` 

652 A `dict` of `Dimension` link name, value pairs that will label the 

653 `DatasetRef` within a Collection. 

654 datasetType : `DatasetType` 

655 The dataset type associated with this dataId. Required to 

656 determine the relevant dimensions. 

657 **kwargs 

658 Additional keyword arguments used to augment or construct a 

659 `DataId`. See `DataId` parameters. 

660 

661 Returns 

662 ------- 

663 dataId : `dict` or `DataCoordinate` 

664 The possibly rewritten dataId. If given a `DataCoordinate` and

665 no keyword arguments, the original dataId will be returned 

666 unchanged. 

667 **kwargs : `dict` 

668 Any unused keyword arguments (would normally be an empty dict).

669 """ 

670 # Do nothing if we have a standalone DataCoordinate. 

671 if isinstance(dataId, DataCoordinate) and not kwargs: 

672 return dataId, kwargs 

673 

674 # Process dimension records that are using record information 

675 # rather than ids 

676 newDataId: Dict[str, DataIdValue] = {} 

677 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

678 

679 # if all the dataId comes from keyword parameters we do not need 

680 # to do anything here because they can't be of the form 

681 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

682 if dataId: 

683 for k, v in dataId.items(): 

684 # If we have a Dimension we do not need to do anything 

685 # because it cannot be a compound key. 

686 if isinstance(k, str) and "." in k: 

687 # Someone is using a more human-readable dataId 

688 dimensionName, record = k.split(".", 1) 

689 byRecord[dimensionName][record] = v 

690 elif isinstance(k, Dimension): 

691 newDataId[k.name] = v 

692 else: 

693 newDataId[k] = v 

694 

695 # Go through the updated dataId and check the type in case someone is 

696 # using an alternate key. We have already filtered out compound

697 # keys in the dimension.record format.

698 not_dimensions = {} 

699 

700 # Will need to look in the dataId and the keyword arguments 

701 # and will remove them if they need to be fixed or are unrecognized. 

702 for dataIdDict in (newDataId, kwargs): 

703 # Use a list so we can adjust the dict safely in the loop 

704 for dimensionName in list(dataIdDict): 

705 value = dataIdDict[dimensionName] 

706 try: 

707 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

708 except KeyError: 

709 # This is not a real dimension 

710 not_dimensions[dimensionName] = value 

711 del dataIdDict[dimensionName] 

712 continue 

713 

714 # Convert an integral type to an explicit int to simplify 

715 # comparisons here 

716 if isinstance(value, numbers.Integral): 

717 value = int(value) 

718 

719 if not isinstance(value, dimension.primaryKey.getPythonType()): 

720 for alternate in dimension.alternateKeys: 

721 if isinstance(value, alternate.getPythonType()): 

722 byRecord[dimensionName][alternate.name] = value 

723 del dataIdDict[dimensionName] 

724 log.debug( 

725 "Converting dimension %s to %s.%s=%s", 

726 dimensionName, 

727 dimensionName, 

728 alternate.name, 

729 value, 

730 ) 

731 break 

732 else: 

733 log.warning( 

734 "Type mismatch found for value '%r' provided for dimension %s. " 

735 "Could not find matching alternative (primary key has type %s) " 

736 "so attempting to use as-is.", 

737 value, 

738 dimensionName, 

739 dimension.primaryKey.getPythonType(), 

740 ) 

741 

742 # By this point kwargs and newDataId should only include valid 

743 # dimensions. Merge kwargs in to the new dataId and log if there 

744 # are dimensions in both (rather than calling update). 

745 for k, v in kwargs.items(): 

746 if k in newDataId and newDataId[k] != v: 

747 log.debug( 

748 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

749 ) 

750 newDataId[k] = v 

751 # No need to retain any values in kwargs now. 

752 kwargs = {} 

753 

754 # If we have some unrecognized dimensions we have to try to connect 

755 # them to records in other dimensions. This is made more complicated 

756 # by some dimensions having records with clashing names. A mitigation 

757 # is that we can tell by this point which dimensions are missing 

758 # for the DatasetType but this does not work for calibrations 

759 # where additional dimensions can be used to constrain the temporal 

760 # axis. 

761 if not_dimensions: 

762 # Search for all dimensions even if we have been given a value 

763 # explicitly. In some cases records are given as well as the 

764 # actual dimension, and this should not be an error if they

765 # match. 

766 mandatoryDimensions = datasetType.dimensions.names # - provided 

767 

768 candidateDimensions: Set[str] = set() 

769 candidateDimensions.update(mandatoryDimensions) 

770 

771 # For calibrations we may well be needing temporal dimensions 

772 # so rather than always including all dimensions in the scan 

773 # restrict things a little. It is still possible for there 

774 # to be confusion over day_obs in visit vs exposure for example. 

775 # If we are not searching calibration collections things may 

776 # fail but they are going to fail anyway because of the 

777 # ambiguity of the dataId...

778 if datasetType.isCalibration(): 

779 for dim in self.registry.dimensions.getStaticDimensions(): 

780 if dim.temporal: 

781 candidateDimensions.add(str(dim)) 

782 

783 # Look up table for the first association with a dimension 

784 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

785 

786 # Keep track of whether an item is associated with multiple 

787 # dimensions. 

788 counter: Counter[str] = Counter() 

789 assigned: Dict[str, Set[str]] = defaultdict(set) 

790 

791 # Go through the missing dimensions and associate the 

792 # given names with records within those dimensions 

793 matched_dims = set() 

794 for dimensionName in candidateDimensions: 

795 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

796 fields = dimension.metadata.names | dimension.uniqueKeys.names 

797 for field in not_dimensions: 

798 if field in fields: 

799 guessedAssociation[dimensionName][field] = not_dimensions[field] 

800 counter[dimensionName] += 1 

801 assigned[field].add(dimensionName) 

802 matched_dims.add(field) 

803 

804 # Calculate the fields that matched nothing. 

805 never_found = set(not_dimensions) - matched_dims 

806 

807 if never_found: 

808 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

809 

810 # There is a chance we have allocated a single dataId item 

811 # to multiple dimensions. Need to decide which should be retained. 

812 # For now assume that the most popular alternative wins. 

813 # This means that day_obs with seq_num will result in 

814 # exposure.day_obs and not visit.day_obs 

815 # Also prefer an explicitly missing dimension over an inferred 

816 # temporal dimension. 

817 for fieldName, assignedDimensions in assigned.items(): 

818 if len(assignedDimensions) > 1: 

819 # Pick the most popular (preferring mandatory dimensions) 

820 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

821 if requiredButMissing: 

822 candidateDimensions = requiredButMissing 

823 else: 

824 candidateDimensions = assignedDimensions 

825 

826 # Select the relevant items and get a new restricted 

827 # counter. 

828 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

829 duplicatesCounter: Counter[str] = Counter() 

830 duplicatesCounter.update(theseCounts) 

831 

832 # Choose the most common. If they are equally common 

833 # we will pick the one that was found first. 

834 # Returns a list of tuples 

835 selected = duplicatesCounter.most_common(1)[0][0] 

836 

837 log.debug( 

838 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

839 " Removed ambiguity by choosing dimension %s.", 

840 fieldName, 

841 ", ".join(assignedDimensions), 

842 selected, 

843 ) 

844 

845 for candidateDimension in assignedDimensions: 

846 if candidateDimension != selected: 

847 del guessedAssociation[candidateDimension][fieldName] 

848 

849 # Update the record look up dict with the new associations 

850 for dimensionName, values in guessedAssociation.items(): 

851 if values: # A dict might now be empty 

852 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

853 byRecord[dimensionName].update(values) 

854 

855 if byRecord: 

856 # Some record specifiers were found so we need to convert 

857 # them to the Id form 

858 for dimensionName, values in byRecord.items(): 

859 if dimensionName in newDataId: 

860 log.debug( 

861 "DataId specified explicit %s dimension value of %s in addition to" 

862 " general record specifiers for it of %s. Ignoring record information.", 

863 dimensionName, 

864 newDataId[dimensionName], 

865 str(values), 

866 ) 

867 # Get the actual record and compare with these values. 

868 try: 

869 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

870 except LookupError: 

871 raise ValueError( 

872 f"Could not find dimension '{dimensionName}'" 

873 f" with dataId {newDataId} as part of comparing with" 

874 f" record values {byRecord[dimensionName]}" 

875 ) from None 

876 if len(recs) == 1: 

877 errmsg: List[str] = [] 

878 for k, v in values.items(): 

879 if (recval := getattr(recs[0], k)) != v: 

880 errmsg.append(f"{k}({recval} != {v})") 

881 if errmsg: 

882 raise ValueError( 

883 f"Dimension {dimensionName} in dataId has explicit value" 

884 " inconsistent with records: " + ", ".join(errmsg) 

885 ) 

886 else: 

887 # Multiple matches for an explicit dimension 

888 # should never happen but let downstream complain. 

889 pass 

890 continue 

891 

892 # Build up a WHERE expression 

893 bind = {k: v for k, v in values.items()} 

894 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

895 

896 # Hopefully we get a single record that matches 

897 records = set( 

898 self.registry.queryDimensionRecords( 

899 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

900 ) 

901 ) 

902 

903 if len(records) != 1: 

904 if len(records) > 1: 

905 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

906 for r in records: 

907 log.debug("- %s", str(r)) 

908 raise ValueError( 

909 f"DataId specification for dimension {dimensionName} is not" 

910 f" uniquely constrained to a single dataset by {values}." 

911 f" Got {len(records)} results." 

912 ) 

913 raise ValueError( 

914 f"DataId specification for dimension {dimensionName} matched no" 

915 f" records when constrained by {values}" 

916 ) 

917 

918 # Get the primary key from the real dimension object 

919 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

920 if not isinstance(dimension, Dimension): 

921 raise RuntimeError( 

922 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

923 ) 

924 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

925 

926 return newDataId, kwargs 

927 
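# The rewriting above is what lets callers of get()/put() spell the same
# data ID in several ways; all values below are hypothetical. An exposure
# can be identified by its integer ID or by day_obs/seq_num records (dict
# form only, since "." is not allowed in keyword arguments), and a string
# given for detector is interpreted as detector.full_name.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["HSC/defaults"])
raw1 = butler.get("raw", instrument="HSC", exposure=903334, detector=10)
raw2 = butler.get("raw", {"exposure.day_obs": 20130617, "exposure.seq_num": 34},
                  instrument="HSC", detector=10)
raw3 = butler.get("raw", instrument="HSC", exposure=903334, detector="1_53")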

928 def _findDatasetRef( 

929 self, 

930 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

931 dataId: Optional[DataId] = None, 

932 *, 

933 collections: Any = None, 

934 allowUnresolved: bool = False, 

935 **kwargs: Any, 

936 ) -> DatasetRef: 

937 """Shared logic for methods that start with a search for a dataset in 

938 the registry. 

939 

940 Parameters 

941 ---------- 

942 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

943 When `DatasetRef` the `dataId` should be `None`. 

944 Otherwise the `DatasetType` or name thereof. 

945 dataId : `dict` or `DataCoordinate`, optional 

946 A `dict` of `Dimension` link name, value pairs that label the 

947 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

948 should be provided as the first argument. 

949 collections : Any, optional 

950 Collections to be searched, overriding ``self.collections``. 

951 Can be any of the types supported by the ``collections`` argument 

952 to butler construction. 

953 allowUnresolved : `bool`, optional 

954 If `True`, return an unresolved `DatasetRef` if finding a resolved 

955 one in the `Registry` fails. Defaults to `False`. 

956 **kwargs 

957 Additional keyword arguments used to augment or construct a 

958 `DataId`. See `DataId` parameters. 

959 

960 Returns 

961 ------- 

962 ref : `DatasetRef` 

963 A reference to the dataset identified by the given arguments. 

964 

965 Raises 

966 ------ 

967 LookupError 

968 Raised if no matching dataset exists in the `Registry` (and 

969 ``allowUnresolved is False``). 

970 ValueError 

971 Raised if a resolved `DatasetRef` was passed as an input, but it 

972 differs from the one found in the registry. 

973 TypeError 

974 Raised if no collections were provided. 

975 """ 

976 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

977 if isinstance(datasetRefOrType, DatasetRef): 

978 idNumber = datasetRefOrType.id 

979 else: 

980 idNumber = None 

981 timespan: Optional[Timespan] = None 

982 

983 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

984 

985 if datasetType.isCalibration(): 

986 # Because this is a calibration dataset, first try to make a 

987 # standardize the data ID without restricting the dimensions to 

988 # those of the dataset type requested, because there may be extra 

989 # dimensions that provide temporal information for a validity-range 

990 # lookup. 

991 dataId = DataCoordinate.standardize( 

992 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

993 ) 

994 if dataId.graph.temporal: 

995 dataId = self.registry.expandDataId(dataId) 

996 timespan = dataId.timespan 

997 else: 

998 # Standardize the data ID to just the dimensions of the dataset 

999 # type instead of letting registry.findDataset do it, so we get the 

1000 # result even if no dataset is found. 

1001 dataId = DataCoordinate.standardize( 

1002 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1003 ) 

1004 # Always look up the DatasetRef, even if one is given, to ensure it is

1005 # present in the current collection. 

1006 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1007 if ref is None: 

1008 if allowUnresolved: 

1009 return DatasetRef(datasetType, dataId) 

1010 else: 

1011 if collections is None: 

1012 collections = self.registry.defaults.collections 

1013 raise LookupError( 

1014 f"Dataset {datasetType.name} with data ID {dataId} " 

1015 f"could not be found in collections {collections}." 

1016 ) 

1017 if idNumber is not None and idNumber != ref.id: 

1018 if collections is None: 

1019 collections = self.registry.defaults.collections 

1020 raise ValueError( 

1021 f"DatasetRef.id provided ({idNumber}) does not match " 

1022 f"id ({ref.id}) in registry in collections {collections}." 

1023 ) 

1024 return ref 

1025 

1026 @transactional 

1027 def put( 

1028 self, 

1029 obj: Any, 

1030 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1031 dataId: Optional[DataId] = None, 

1032 *, 

1033 run: Optional[str] = None, 

1034 **kwargs: Any, 

1035 ) -> DatasetRef: 

1036 """Store and register a dataset. 

1037 

1038 Parameters 

1039 ---------- 

1040 obj : `object` 

1041 The dataset. 

1042 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1043 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1044 Otherwise the `DatasetType` or name thereof. 

1045 dataId : `dict` or `DataCoordinate` 

1046 A `dict` of `Dimension` link name, value pairs that label the 

1047 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1048 should be provided as the second argument. 

1049 run : `str`, optional 

1050 The name of the run the dataset should be added to, overriding 

1051 ``self.run``. 

1052 **kwargs 

1053 Additional keyword arguments used to augment or construct a 

1054 `DataCoordinate`. See `DataCoordinate.standardize` 

1055 parameters. 

1056 

1057 Returns 

1058 ------- 

1059 ref : `DatasetRef` 

1060 A reference to the stored dataset, updated with the correct id if 

1061 given. 

1062 

1063 Raises 

1064 ------ 

1065 TypeError 

1066 Raised if the butler is read-only or if no run has been provided. 

1067 """ 

1068 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1069 if not self.isWriteable(): 

1070 raise TypeError("Butler is read-only.") 

1071 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1072 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1073 raise ValueError("DatasetRef must not be in registry, must have None id") 

1074 

1075 # Handle dimension records in dataId 

1076 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1077 

1078 # Add Registry Dataset entry. 

1079 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1080 

1081 # For an execution butler the datasets will be pre-defined. 

1082 # If the butler is configured that way datasets should only be inserted 

1083 # if they do not already exist in registry. Trying and catching 

1084 # ConflictingDefinitionError will not work because the transaction 

1085 # will be corrupted. Instead, in this mode always check first. 

1086 ref = None 

1087 ref_is_predefined = False 

1088 if self._allow_put_of_predefined_dataset: 

1089 # Get the matching ref for this run. 

1090 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1091 

1092 if ref: 

1093 # Must be expanded form for datastore templating 

1094 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1095 ref = ref.expanded(dataId) 

1096 ref_is_predefined = True 

1097 

1098 if not ref: 

1099 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1100 

1101 # If the ref is predefined it is possible that the datastore also 

1102 # has the record. Asking datastore to put it again will result in 

1103 # the artifact being recreated, overwriting the previous one, and then

1104 # causing a failure in writing the record, which will cause the artifact

1105 # to be removed. Much safer to ask first before attempting to 

1106 # overwrite. Race conditions should not be an issue for the 

1107 # execution butler environment. 

1108 if ref_is_predefined: 

1109 if self.datastore.knows(ref): 

1110 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

1111 

1112 self.datastore.put(obj, ref) 

1113 

1114 return ref 

1115 
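# A minimal put() sketch with a hypothetical repo, run, dataset type and
# data ID; the returned DatasetRef is resolved and can be passed straight
# to getDirect() or getDirectDeferred().
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
bias = object()  # placeholder for an in-memory dataset of the right storage class
ref = butler.put(bias, "bias", instrument="HSC", detector=10)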

1116 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

1117 """Retrieve a stored dataset. 

1118 

1119 Unlike `Butler.get`, this method allows datasets outside the Butler's 

1120 collection to be read as long as the `DatasetRef` that identifies them 

1121 can be obtained separately. 

1122 

1123 Parameters 

1124 ---------- 

1125 ref : `DatasetRef` 

1126 Resolved reference to an already stored dataset. 

1127 parameters : `dict` 

1128 Additional StorageClass-defined options to control reading, 

1129 typically used to efficiently read only a subset of the dataset. 

1130 

1131 Returns 

1132 ------- 

1133 obj : `object` 

1134 The dataset. 

1135 """ 

1136 return self.datastore.get(ref, parameters=parameters) 

1137 

1138 def getDirectDeferred( 

1139 self, ref: DatasetRef, *, parameters: Union[dict, None] = None 

1140 ) -> DeferredDatasetHandle: 

1141 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1142 from a resolved `DatasetRef`. 

1143 

1144 Parameters 

1145 ---------- 

1146 ref : `DatasetRef` 

1147 Resolved reference to an already stored dataset. 

1148 parameters : `dict` 

1149 Additional StorageClass-defined options to control reading, 

1150 typically used to efficiently read only a subset of the dataset. 

1151 

1152 Returns 

1153 ------- 

1154 obj : `DeferredDatasetHandle` 

1155 A handle which can be used to retrieve a dataset at a later time. 

1156 

1157 Raises 

1158 ------ 

1159 AmbiguousDatasetError 

1160 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1161 """ 

1162 if ref.id is None: 

1163 raise AmbiguousDatasetError( 

1164 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1165 ) 

1166 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1167 

1168 def getDeferred( 

1169 self, 

1170 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1171 dataId: Optional[DataId] = None, 

1172 *, 

1173 parameters: Union[dict, None] = None, 

1174 collections: Any = None, 

1175 **kwargs: Any, 

1176 ) -> DeferredDatasetHandle: 

1177 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1178 after an immediate registry lookup. 

1179 

1180 Parameters 

1181 ---------- 

1182 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1183 When `DatasetRef` the `dataId` should be `None`. 

1184 Otherwise the `DatasetType` or name thereof. 

1185 dataId : `dict` or `DataCoordinate`, optional 

1186 A `dict` of `Dimension` link name, value pairs that label the 

1187 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1188 should be provided as the first argument. 

1189 parameters : `dict` 

1190 Additional StorageClass-defined options to control reading, 

1191 typically used to efficiently read only a subset of the dataset. 

1192 collections : Any, optional 

1193 Collections to be searched, overriding ``self.collections``. 

1194 Can be any of the types supported by the ``collections`` argument 

1195 to butler construction. 

1196 **kwargs 

1197 Additional keyword arguments used to augment or construct a 

1198 `DataId`. See `DataId` parameters. 

1199 

1200 Returns 

1201 ------- 

1202 obj : `DeferredDatasetHandle` 

1203 A handle which can be used to retrieve a dataset at a later time. 

1204 

1205 Raises 

1206 ------ 

1207 LookupError 

1208 Raised if no matching dataset exists in the `Registry` (and 

1209 ``allowUnresolved is False``). 

1210 ValueError 

1211 Raised if a resolved `DatasetRef` was passed as an input, but it 

1212 differs from the one found in the registry. 

1213 TypeError 

1214 Raised if no collections were provided. 

1215 """ 

1216 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1217 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1218 
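# A sketch contrasting the deferred-read paths above: getDirectDeferred()
# takes an already-resolved DatasetRef, while getDeferred() does the
# registry lookup immediately but postpones the datastore read until the
# handle's get() is called. The repo, names and data IDs are hypothetical.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["HSC/defaults"])
handle = butler.getDeferred("calexp", visit=903334, detector=20, instrument="HSC")
calexp = handle.get()  # the datastore read happens here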

1219 def get( 

1220 self, 

1221 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1222 dataId: Optional[DataId] = None, 

1223 *, 

1224 parameters: Optional[Dict[str, Any]] = None, 

1225 collections: Any = None, 

1226 **kwargs: Any, 

1227 ) -> Any: 

1228 """Retrieve a stored dataset. 

1229 

1230 Parameters 

1231 ---------- 

1232 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1233 When `DatasetRef` the `dataId` should be `None`. 

1234 Otherwise the `DatasetType` or name thereof. 

1235 dataId : `dict` or `DataCoordinate` 

1236 A `dict` of `Dimension` link name, value pairs that label the 

1237 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1238 should be provided as the first argument. 

1239 parameters : `dict` 

1240 Additional StorageClass-defined options to control reading, 

1241 typically used to efficiently read only a subset of the dataset. 

1242 collections : Any, optional 

1243 Collections to be searched, overriding ``self.collections``. 

1244 Can be any of the types supported by the ``collections`` argument 

1245 to butler construction. 

1246 **kwargs 

1247 Additional keyword arguments used to augment or construct a 

1248 `DataCoordinate`. See `DataCoordinate.standardize` 

1249 parameters. 

1250 

1251 Returns 

1252 ------- 

1253 obj : `object` 

1254 The dataset. 

1255 

1256 Raises 

1257 ------ 

1258 ValueError 

1259 Raised if a resolved `DatasetRef` was passed as an input, but it 

1260 differs from the one found in the registry. 

1261 LookupError 

1262 Raised if no matching dataset exists in the `Registry`. 

1263 TypeError 

1264 Raised if no collections were provided. 

1265 

1266 Notes 

1267 ----- 

1268 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1269 this method requires that the given data ID include temporal dimensions 

1270 beyond the dimensions of the dataset type itself, in order to find the 

1271 dataset with the appropriate validity range. For example, a "bias" 

1272 dataset with native dimensions ``{instrument, detector}`` could be 

1273 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1274 ``exposure`` is a temporal dimension. 

1275 """ 

1276 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1277 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1278 return self.getDirect(ref, parameters=parameters) 

1279 
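# A get() sketch: read a whole dataset from the default collections, then
# re-read it with StorageClass ``parameters`` to fetch only a subset. The
# repo, names, data IDs and the "bbox" parameter are hypothetical; which
# parameters exist depends on the dataset's storage class.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["HSC/defaults"])
calexp = butler.get("calexp", visit=903334, detector=20, instrument="HSC")
some_bbox = ...  # stand-in for a region object accepted by the storage class
cutout = butler.get("calexp", visit=903334, detector=20, instrument="HSC",
                    parameters={"bbox": some_bbox})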

1280 def getURIs( 

1281 self, 

1282 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1283 dataId: Optional[DataId] = None, 

1284 *, 

1285 predict: bool = False, 

1286 collections: Any = None, 

1287 run: Optional[str] = None, 

1288 **kwargs: Any, 

1289 ) -> Tuple[Optional[ResourcePath], Dict[str, ResourcePath]]: 

1290 """Returns the URIs associated with the dataset. 

1291 

1292 Parameters 

1293 ---------- 

1294 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1295 When `DatasetRef` the `dataId` should be `None`. 

1296 Otherwise the `DatasetType` or name thereof. 

1297 dataId : `dict` or `DataCoordinate` 

1298 A `dict` of `Dimension` link name, value pairs that label the 

1299 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1300 should be provided as the first argument. 

1301 predict : `bool` 

1302 If `True`, allow URIs to be returned of datasets that have not 

1303 been written. 

1304 collections : Any, optional 

1305 Collections to be searched, overriding ``self.collections``. 

1306 Can be any of the types supported by the ``collections`` argument 

1307 to butler construction. 

1308 run : `str`, optional 

1309 Run to use for predictions, overriding ``self.run``. 

1310 **kwargs 

1311 Additional keyword arguments used to augment or construct a 

1312 `DataCoordinate`. See `DataCoordinate.standardize` 

1313 parameters. 

1314 

1315 Returns 

1316 ------- 

1317 primary : `lsst.resources.ResourcePath` 

1318 The URI to the primary artifact associated with this dataset. 

1319 If the dataset was disassembled within the datastore this 

1320 may be `None`. 

1321 components : `dict` 

1322 URIs to any components associated with the dataset artifact. 

1323 Can be empty if there are no components. 

1324 """ 

1325 ref = self._findDatasetRef( 

1326 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1327 ) 

1328 if ref.id is None: # only possible if predict is True 

1329 if run is None: 

1330 run = self.run 

1331 if run is None: 

1332 raise TypeError("Cannot predict location with run=None.") 

1333 # Lie about ID, because we can't guess it, and only 

1334 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1335 ref = ref.resolved(id=0, run=run) 

1336 return self.datastore.getURIs(ref, predict) 

1337 

1338 def getURI( 

1339 self, 

1340 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1341 dataId: Optional[DataId] = None, 

1342 *, 

1343 predict: bool = False, 

1344 collections: Any = None, 

1345 run: Optional[str] = None, 

1346 **kwargs: Any, 

1347 ) -> ResourcePath: 

1348 """Return the URI to the Dataset. 

1349 

1350 Parameters 

1351 ---------- 

1352 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1353 When `DatasetRef` the `dataId` should be `None`. 

1354 Otherwise the `DatasetType` or name thereof. 

1355 dataId : `dict` or `DataCoordinate` 

1356 A `dict` of `Dimension` link name, value pairs that label the 

1357 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1358 should be provided as the first argument. 

1359 predict : `bool` 

1360 If `True`, allow URIs to be returned of datasets that have not 

1361 been written. 

1362 collections : Any, optional 

1363 Collections to be searched, overriding ``self.collections``. 

1364 Can be any of the types supported by the ``collections`` argument 

1365 to butler construction. 

1366 run : `str`, optional 

1367 Run to use for predictions, overriding ``self.run``. 

1368 **kwargs 

1369 Additional keyword arguments used to augment or construct a 

1370 `DataCoordinate`. See `DataCoordinate.standardize` 

1371 parameters. 

1372 

1373 Returns 

1374 ------- 

1375 uri : `lsst.resources.ResourcePath` 

1376 URI pointing to the Dataset within the datastore. If the 

1377 Dataset does not exist in the datastore, and if ``predict`` is 

1378 `True`, the URI will be a prediction and will include a URI 

1379 fragment "#predicted". 

1380 If the datastore does not have entities that relate well 

1381 to the concept of a URI the returned URI string will be 

1382 descriptive. The returned URI is not guaranteed to be obtainable. 

1383 

1384 Raises 

1385 ------ 

1386 LookupError 

1387 A URI has been requested for a dataset that does not exist and 

1388 guessing is not allowed. 

1389 ValueError 

1390 Raised if a resolved `DatasetRef` was passed as an input, but it 

1391 differs from the one found in the registry. 

1392 TypeError 

1393 Raised if no collections were provided. 

1394 RuntimeError 

1395 Raised if a URI is requested for a dataset that consists of 

1396 multiple artifacts. 

1397 """ 

1398 primary, components = self.getURIs( 

1399 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1400 ) 

1401 

1402 if primary is None or components: 

1403 raise RuntimeError( 

1404 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1405 "Use Butler.getURIs() instead." 

1406 ) 

1407 return primary 

1408 
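# A sketch of getURI()/getURIs() with hypothetical names and data IDs:
# ask for the artifact location of an existing dataset, predict where a
# not-yet-written one would land, and fall back to getURIs() when the
# datastore disassembled the dataset into components.
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", collections=["HSC/defaults"])
uri = butler.getURI("calexp", visit=903334, detector=20, instrument="HSC")
predicted = butler.getURI("calexp", visit=903335, detector=20, instrument="HSC",
                          predict=True, run="u/alice/DM-50000/a")
primary, components = butler.getURIs("calexp", visit=903334, detector=20,
                                     instrument="HSC")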

1409 def retrieveArtifacts( 

1410 self, 

1411 refs: Iterable[DatasetRef], 

1412 destination: ResourcePathExpression, 

1413 transfer: str = "auto", 

1414 preserve_path: bool = True, 

1415 overwrite: bool = False, 

1416 ) -> List[ResourcePath]: 

1417 """Retrieve the artifacts associated with the supplied refs. 

1418 

1419 Parameters 

1420 ---------- 

1421 refs : iterable of `DatasetRef` 

1422 The datasets for which artifacts are to be retrieved. 

1423 A single ref can result in multiple artifacts. The refs must 

1424 be resolved. 

1425 destination : `lsst.resources.ResourcePath` or `str` 

1426 Location to write the artifacts. 

1427 transfer : `str`, optional 

1428 Method to use to transfer the artifacts. Must be one of the options 

1429 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1430 "move" is not allowed. 

1431 preserve_path : `bool`, optional 

1432 If `True` the full path of the artifact within the datastore 

1433 is preserved. If `False` the final file component of the path 

1434 is used. 

1435 overwrite : `bool`, optional 

1436 If `True` allow transfers to overwrite existing files at the 

1437 destination. 

1438 

1439 Returns 

1440 ------- 

1441 targets : `list` of `lsst.resources.ResourcePath` 

1442 URIs of file artifacts in destination location. Order is not 

1443 preserved. 

1444 

1445 Notes 

1446 ----- 

1447 For non-file datastores the artifacts written to the destination 

1448 may not match the representation inside the datastore. For example, 

1449 a hierarchical data structure in a NoSQL database may well be stored 

1450 as a JSON file. 

1451 """ 

1452 return self.datastore.retrieveArtifacts( 

1453 refs, 

1454 ResourcePath(destination), 

1455 transfer=transfer, 

1456 preserve_path=preserve_path, 

1457 overwrite=overwrite, 

1458 ) 

1459 
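Example (illustrative sketch; the dataset type and collection names are hypothetical): copy the file artifacts behind a set of resolved refs into a local directory:

    refs = butler.registry.queryDatasets("flat", collections="MyCam/calib")
    # Returned paths are the new locations under the destination directory.
    paths = butler.retrieveArtifacts(refs, destination="/tmp/flats", transfer="copy", preserve_path=True)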

1460 def datasetExists( 

1461 self, 

1462 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1463 dataId: Optional[DataId] = None, 

1464 *, 

1465 collections: Any = None, 

1466 **kwargs: Any, 

1467 ) -> bool: 

1468 """Return True if the Dataset is actually present in the Datastore. 

1469 

1470 Parameters 

1471 ---------- 

1472 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1473 When `DatasetRef` the `dataId` should be `None`. 

1474 Otherwise the `DatasetType` or name thereof. 

1475 dataId : `dict` or `DataCoordinate` 

1476 A `dict` of `Dimension` link name, value pairs that label the 

1477 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1478 should be provided as the first argument. 

1479 collections : Any, optional 

1480 Collections to be searched, overriding ``self.collections``. 

1481 Can be any of the types supported by the ``collections`` argument 

1482 to butler construction. 

1483 **kwargs 

1484 Additional keyword arguments used to augment or construct a 

1485 `DataCoordinate`. See `DataCoordinate.standardize` 

1486 parameters. 

1487 

1488 Raises 

1489 ------ 

1490 LookupError 

1491 Raised if the dataset is not even present in the Registry. 

1492 ValueError 

1493 Raised if a resolved `DatasetRef` was passed as an input, but it 

1494 differs from the one found in the registry. 

1495 TypeError 

1496 Raised if no collections were provided. 

1497 """ 

1498 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1499 return self.datastore.exists(ref) 

1500 
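Example (illustrative sketch; dataset type, data ID keys, and collection name are hypothetical): check that the artifact is actually present in the datastore, not merely registered, before reading it:

    if butler.datasetExists("flat", instrument="MyCam", detector=12, collections="MyCam/calib"):
        flat = butler.get("flat", instrument="MyCam", detector=12, collections="MyCam/calib")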

1501 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1502 """Remove one or more `~CollectionType.RUN` collections and the 

1503 datasets within them. 

1504 

1505 Parameters 

1506 ---------- 

1507 names : `Iterable` [ `str` ] 

1508 The names of the collections to remove. 

1509 unstore : `bool`, optional 

1510 If `True` (default), delete datasets from all datastores in which 

1511 they are present, and attempt to roll back the registry deletions if 

1512 datastore deletions fail (which may not always be possible). If 

1513 `False`, datastore records for these datasets are still removed, 

1514 but any artifacts (e.g. files) will not be. 

1515 

1516 Raises 

1517 ------ 

1518 TypeError 

1519 Raised if one or more collections are not of type 

1520 `~CollectionType.RUN`. 

1521 """ 

1522 if not self.isWriteable(): 

1523 raise TypeError("Butler is read-only.") 

1524 names = list(names) 

1525 refs: List[DatasetRef] = [] 

1526 for name in names: 

1527 collectionType = self.registry.getCollectionType(name) 

1528 if collectionType is not CollectionType.RUN: 

1529 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1530 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1531 with self.registry.transaction(): 

1532 if unstore: 

1533 self.datastore.trash(refs) 

1534 else: 

1535 self.datastore.forget(refs) 

1536 for name in names: 

1537 self.registry.removeCollection(name) 

1538 if unstore: 

1539 # Point of no return for removing artifacts 

1540 self.datastore.emptyTrash() 

1541 
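Example (illustrative sketch; the run name is hypothetical): remove a scratch RUN collection and delete its artifacts from every datastore:

    butler = Butler("./repo", writeable=True)  # removal requires a writeable butler
    butler.removeRuns(["u/someone/scratch"], unstore=True)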

1542 def pruneCollection( 

1543 self, name: str, purge: bool = False, unstore: bool = False, unlink: Optional[List[str]] = None 

1544 ) -> None: 

1545 """Remove a collection and possibly prune datasets within it. 

1546 

1547 Parameters 

1548 ---------- 

1549 name : `str` 

1550 Name of the collection to remove. If this is a 

1551 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1552 datasets within the collection are not modified unless ``unstore`` 

1553 is `True`. If this is a `~CollectionType.RUN` collection, 

1554 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1555 are fully removed from the data repository. 

1556 purge : `bool`, optional 

1557 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1558 fully removing datasets within them. Requires ``unstore=True`` as 

1559 well, as an added precaution against accidental deletion. Must be 

1560 `False` (default) if the collection is not a ``RUN``. 

1561 unstore : `bool`, optional 

1562 If `True`, remove all datasets in the collection from all 

1563 datastores in which they appear. 

1564 unlink : `list` [`str`], optional 

1565 Before removing the given collection, unlink it from these 

1566 parent collections. 

1567 

1568 Raises 

1569 ------ 

1570 TypeError 

1571 Raised if the butler is read-only or arguments are mutually 

1572 inconsistent. 

1573 """ 

1574 # See pruneDatasets comments for more information about the logic here; 

1575 # the cases are almost the same, but here we can rely on Registry to 

1576 # take care of everything but Datastore deletion when we remove the 

1577 # collection. 

1578 if not self.isWriteable(): 

1579 raise TypeError("Butler is read-only.") 

1580 collectionType = self.registry.getCollectionType(name) 

1581 if purge and not unstore: 

1582 raise PurgeWithoutUnstorePruneCollectionsError() 

1583 if collectionType is CollectionType.RUN and not purge: 

1584 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1585 if collectionType is not CollectionType.RUN and purge: 

1586 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1587 

1588 def remove(child: str, parent: str) -> None: 

1589 """Remove a child collection from a parent collection.""" 

1590 # Remove child from parent. 

1591 chain = list(self.registry.getCollectionChain(parent)) 

1592 try: 

1593 chain.remove(child) 

1594 except ValueError as e: 

1595 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1596 self.registry.setCollectionChain(parent, chain) 

1597 

1598 with self.registry.transaction(): 

1599 if unlink: 

1600 for parent in unlink: 

1601 remove(name, parent) 

1602 if unstore: 

1603 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1604 self.datastore.trash(refs) 

1605 self.registry.removeCollection(name) 

1606 

1607 if unstore: 

1608 # Point of no return for removing artifacts 

1609 self.datastore.emptyTrash() 

1610 
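Example (illustrative sketch; all collection names are hypothetical): a RUN collection may only be removed with both ``purge`` and ``unstore`` set, while a TAGGED collection can be dropped after unlinking it from a parent CHAINED collection:

    butler.pruneCollection("u/someone/scratch", purge=True, unstore=True)
    butler.pruneCollection("my-tag", unlink=["my-chain"])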

1611 def pruneDatasets( 

1612 self, 

1613 refs: Iterable[DatasetRef], 

1614 *, 

1615 disassociate: bool = True, 

1616 unstore: bool = False, 

1617 tags: Iterable[str] = (), 

1618 purge: bool = False, 

1619 run: Optional[str] = None, 

1620 ) -> None: 

1621 """Remove one or more datasets from a collection and/or storage. 

1622 

1623 Parameters 

1624 ---------- 

1625 refs : `~collections.abc.Iterable` of `DatasetRef` 

1626 Datasets to prune. These must be "resolved" references (not just 

1627 a `DatasetType` and data ID). 

1628 disassociate : `bool`, optional 

1629 Disassociate pruned datasets from ``tags``, or from all collections 

1630 if ``purge=True``. 

1631 unstore : `bool`, optional 

1632 If `True` (`False` is default) remove these datasets from all 

1633 datastores known to this butler. Note that this will make it 

1634 impossible to retrieve these datasets even via other collections. 

1635 Datasets that are already not stored are ignored by this option. 

1636 tags : `Iterable` [ `str` ], optional 

1637 `~CollectionType.TAGGED` collections to disassociate the datasets 

1638 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1639 `True`. 

1640 purge : `bool`, optional 

1641 If `True` (`False` is default), completely remove the dataset from 

1642 the `Registry`. To prevent accidental deletions, ``purge`` may 

1643 only be `True` if all of the following conditions are met: 

1644 

1645 - All given datasets are in the given run; 

1646 - ``disassociate`` is `True`; 

1647 - ``unstore`` is `True`. 

1648 

1649 This mode may remove provenance information from datasets other 

1650 than those provided, and should be used with extreme care. 

1651 

1652 Raises 

1653 ------ 

1654 TypeError 

1655 Raised if the butler is read-only, if no collection was provided, 

1656 or the conditions for ``purge=True`` were not met. 

1657 """ 

1658 if not self.isWriteable(): 

1659 raise TypeError("Butler is read-only.") 

1660 if purge: 

1661 if not disassociate: 

1662 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1663 if not unstore: 

1664 raise TypeError("Cannot pass purge=True without unstore=True.") 

1665 elif disassociate: 

1666 tags = tuple(tags) 

1667 if not tags: 

1668 raise TypeError("No tags provided but disassociate=True.") 

1669 for tag in tags: 

1670 collectionType = self.registry.getCollectionType(tag) 

1671 if collectionType is not CollectionType.TAGGED: 

1672 raise TypeError( 

1673 f"Cannot disassociate from collection '{tag}' " 

1674 f"of non-TAGGED type {collectionType.name}." 

1675 ) 

1676 # Transform possibly-single-pass iterable into something we can iterate 

1677 # over multiple times. 

1678 refs = list(refs) 

1679 # Pruning a component of a DatasetRef makes no sense since registry 

1680 # doesn't know about components and datastore might not store 

1681 # components in a separate file 

1682 for ref in refs: 

1683 if ref.datasetType.component(): 

1684 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1685 # We don't need an unreliable Datastore transaction for this, because 

1686 # we've been extra careful to ensure that Datastore.trash only involves 

1687 # mutating the Registry (it can _look_ at Datastore-specific things, 

1688 # but shouldn't change them), and hence all operations here are 

1689 # Registry operations. 

1690 with self.registry.transaction(): 

1691 if unstore: 

1692 self.datastore.trash(refs) 

1693 if purge: 

1694 self.registry.removeDatasets(refs) 

1695 elif disassociate: 

1696 assert tags, "Guaranteed by earlier logic in this function." 

1697 for tag in tags: 

1698 self.registry.disassociate(tag, refs) 

1699 # We've exited the Registry transaction, and apparently committed. 

1700 # (if there was an exception, everything rolled back, and it's as if 

1701 # nothing happened - and we never get here). 

1702 # Datastore artifacts are not yet gone, but they're clearly marked 

1703 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1704 # problems we can try again later, and if manual administrative 

1705 # intervention is required, it's pretty clear what that should entail: 

1706 # deleting everything on disk and in private Datastore tables that is 

1707 # in the dataset_location_trash table. 

1708 if unstore: 

1709 # Point of no return for removing artifacts 

1710 self.datastore.emptyTrash() 

1711 
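Example (illustrative sketch; dataset type and run name are hypothetical): fully delete a set of datasets, which requires resolved refs plus ``disassociate``, ``unstore``, and ``purge`` all enabled for the run they live in:

    refs = list(butler.registry.queryDatasets("flat", collections="u/someone/scratch"))
    butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True, run="u/someone/scratch")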

1712 @transactional 

1713 def ingest( 

1714 self, 

1715 *datasets: FileDataset, 

1716 transfer: Optional[str] = "auto", 

1717 run: Optional[str] = None, 

1718 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1719 record_validation_info: bool = True, 

1720 ) -> None: 

1721 """Store and register one or more datasets that already exist on disk. 

1722 

1723 Parameters 

1724 ---------- 

1725 datasets : `FileDataset` 

1726 Each positional argument is a struct containing information about 

1727 a file to be ingested, including its URI (either absolute or 

1728 relative to the datastore root, if applicable), a `DatasetRef`, 

1729 and optionally a formatter class or its fully-qualified string 

1730 name. If a formatter is not provided, the formatter that would be 

1731 used for `put` is assumed. On successful return, all 

1732 `FileDataset.refs` attributes will have their `DatasetRef.id` 

1733 attribute populated and all `FileDataset.formatter` attributes will 

1734 be set to the formatter class used. `FileDataset.path` attributes 

1735 may be modified to put paths in whatever the datastore considers a 

1736 standardized form. 

1737 transfer : `str`, optional 

1738 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1739 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1740 transfer the file. 

1741 run : `str`, optional 

1742 The name of the run ingested datasets should be added to, 

1743 overriding ``self.run``. 

1744 idGenerationMode : `DatasetIdGenEnum`, optional 

1745 Specifies option for generating dataset IDs. By default unique IDs 

1746 are generated for each inserted dataset. 

1747 record_validation_info : `bool`, optional 

1748 If `True`, the default, the datastore can record validation 

1749 information associated with the file. If `False` the datastore 

1750 will not attempt to track any information such as checksums 

1751 or file sizes. This can be useful if such information is tracked 

1752 in an external system or if the file is to be compressed in place. 

1753 It is up to the datastore whether this parameter is relevant. 

1754 

1755 Raises 

1756 ------ 

1757 TypeError 

1758 Raised if the butler is read-only or if no run was provided. 

1759 NotImplementedError 

1760 Raised if the `Datastore` does not support the given transfer mode. 

1761 DatasetTypeNotSupportedError 

1762 Raised if one or more files to be ingested have a dataset type that 

1763 is not supported by the `Datastore`. 

1764 FileNotFoundError 

1765 Raised if one of the given files does not exist. 

1766 FileExistsError 

1767 Raised if transfer is not `None` but the (internal) location the 

1768 file would be moved to is already occupied. 

1769 

1770 Notes 

1771 ----- 

1772 This operation is not fully exception safe: if a database operation 

1773 fails, the given `FileDataset` instances may be only partially updated. 

1774 

1775 It is atomic in terms of database operations (they will either all 

1776 succeed or all fail), provided the database engine implements 

1777 transactions correctly. It will attempt to be atomic in terms of 

1778 filesystem operations as well, but this cannot be implemented 

1779 rigorously for most datastores. 

1780 """ 

1781 if not self.isWriteable(): 

1782 raise TypeError("Butler is read-only.") 

1783 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1784 # Reorganize the inputs so they're grouped by DatasetType and then 

1785 # data ID. We also include a list of DatasetRefs for each FileDataset 

1786 # to hold the resolved DatasetRefs returned by the Registry, before 

1787 # it's safe to swap them into FileDataset.refs. 

1788 # Some type annotation aliases to make that clearer: 

1789 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1790 GroupedData = MutableMapping[DatasetType, GroupForType] 

1791 # The actual data structure: 

1792 groupedData: GroupedData = defaultdict(dict) 

1793 # And the nested loop that populates it: 

1794 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1795 # This list intentionally shared across the inner loop, since it's 

1796 # associated with `dataset`. 

1797 resolvedRefs: List[DatasetRef] = [] 

1798 

1799 # Somewhere to store pre-existing refs if we have an 

1800 # execution butler. 

1801 existingRefs: List[DatasetRef] = [] 

1802 

1803 for ref in dataset.refs: 

1804 if ref.dataId in groupedData[ref.datasetType]: 

1805 raise ConflictingDefinitionError( 

1806 f"Ingest conflict. Dataset {dataset.path} has same" 

1807 " DataId as other ingest dataset" 

1808 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1809 f" ({ref.dataId})" 

1810 ) 

1811 if self._allow_put_of_predefined_dataset: 

1812 existing_ref = self.registry.findDataset( 

1813 ref.datasetType, dataId=ref.dataId, collections=run 

1814 ) 

1815 if existing_ref: 

1816 if self.datastore.knows(existing_ref): 

1817 raise ConflictingDefinitionError( 

1818 f"Dataset associated with path {dataset.path}" 

1819 f" already exists as {existing_ref}." 

1820 ) 

1821 # Store this ref elsewhere since it already exists 

1822 # and we do not want to remake it but we do want 

1823 # to store it in the datastore. 

1824 existingRefs.append(existing_ref) 

1825 

1826 # Nothing else to do until we have finished 

1827 # iterating. 

1828 continue 

1829 

1830 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1831 

1832 if existingRefs: 

1833 

1834 if len(dataset.refs) != len(existingRefs): 

1835 # Keeping track of partially pre-existing datasets is hard 

1836 # and should generally never happen. For now don't allow 

1837 # it. 

1838 raise ConflictingDefinitionError( 

1839 f"For dataset {dataset.path} some dataIds already exist" 

1840 " in registry but others do not. This is not supported." 

1841 ) 

1842 

1843 # Attach the resolved refs if we found them. 

1844 dataset.refs = existingRefs 

1845 

1846 # Now we can bulk-insert into Registry for each DatasetType. 

1847 for datasetType, groupForType in progress.iter_item_chunks( 

1848 groupedData.items(), desc="Bulk-inserting datasets by type" 

1849 ): 

1850 refs = self.registry.insertDatasets( 

1851 datasetType, 

1852 dataIds=groupForType.keys(), 

1853 run=run, 

1854 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1855 idGenerationMode=idGenerationMode, 

1856 ) 

1857 # Append those resolved DatasetRefs to the new lists we set up for 

1858 # them. 

1859 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1860 resolvedRefs.append(ref) 

1861 

1862 # Go back to the original FileDatasets to replace their refs with the 

1863 # new resolved ones. 

1864 for groupForType in progress.iter_chunks( 

1865 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1866 ): 

1867 for dataset, resolvedRefs in groupForType.values(): 

1868 dataset.refs = resolvedRefs 

1869 

1870 # Bulk-insert everything into Datastore. 

1871 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1872 
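Example (illustrative sketch): ingest an existing file, copying it into the datastore. The file path, run, dataset type, and data ID values are hypothetical; the data ID must cover the dataset type's dimensions:

    from lsst.daf.butler import DatasetRef, FileDataset

    datasetType = butler.registry.getDatasetType("flat")  # hypothetical dataset type
    ref = DatasetRef(datasetType, {"instrument": "MyCam", "detector": 12, "physical_filter": "g"})
    butler.ingest(FileDataset(path="/data/flat-12-g.fits", refs=[ref]), transfer="copy", run="MyCam/calib")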

1873 @contextlib.contextmanager 

1874 def export( 

1875 self, 

1876 *, 

1877 directory: Optional[str] = None, 

1878 filename: Optional[str] = None, 

1879 format: Optional[str] = None, 

1880 transfer: Optional[str] = None, 

1881 ) -> Iterator[RepoExportContext]: 

1882 """Export datasets from the repository represented by this `Butler`. 

1883 

1884 This method is a context manager that returns a helper object 

1885 (`RepoExportContext`) that is used to indicate what information from 

1886 the repository should be exported. 

1887 

1888 Parameters 

1889 ---------- 

1890 directory : `str`, optional 

1891 Directory dataset files should be written to if ``transfer`` is not 

1892 `None`. 

1893 filename : `str`, optional 

1894 Name for the file that will include database information associated 

1895 with the exported datasets. If this is not an absolute path and 

1896 ``directory`` is not `None`, it will be written to ``directory`` 

1897 instead of the current working directory. Defaults to 

1898 "export.{format}". 

1899 format : `str`, optional 

1900 File format for the database information file. If `None`, the 

1901 extension of ``filename`` will be used. 

1902 transfer : `str`, optional 

1903 Transfer mode passed to `Datastore.export`. 

1904 

1905 Raises 

1906 ------ 

1907 TypeError 

1908 Raised if the set of arguments passed is inconsistent. 

1909 

1910 Examples 

1911 -------- 

1912 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1913 methods are used to provide the iterables over data IDs and/or datasets 

1914 to be exported:: 

1915 

1916 with butler.export("exports.yaml") as export: 

1917 # Export all flats, but none of the dimension element rows 

1918 # (i.e. data ID information) associated with them. 

1919 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1920 elements=()) 

1921 # Export all datasets that start with "deepCoadd_" and all of 

1922 # their associated data ID information. 

1923 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1924 """ 

1925 if directory is None and transfer is not None: 

1926 raise TypeError("Cannot transfer without providing a directory.") 

1927 if transfer == "move": 

1928 raise TypeError("Transfer may not be 'move': export is read-only") 

1929 if format is None: 

1930 if filename is None: 

1931 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1932 else: 

1933 _, format = os.path.splitext(filename) 

1934 elif filename is None: 

1935 filename = f"export.{format}" 

1936 if directory is not None: 

1937 filename = os.path.join(directory, filename) 

1938 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["export"]) 

1939 with open(filename, "w") as stream: 

1940 backend = BackendClass(stream) 

1941 try: 

1942 helper = RepoExportContext( 

1943 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

1944 ) 

1945 yield helper 

1946 except BaseException: 

1947 raise 

1948 else: 

1949 helper._finish() 

1950 

1951 def import_( 

1952 self, 

1953 *, 

1954 directory: Optional[str] = None, 

1955 filename: Union[str, TextIO, None] = None, 

1956 format: Optional[str] = None, 

1957 transfer: Optional[str] = None, 

1958 skip_dimensions: Optional[Set] = None, 

1959 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1960 reuseIds: bool = False, 

1961 ) -> None: 

1962 """Import datasets into this repository that were exported from a 

1963 different butler repository via `~lsst.daf.butler.Butler.export`. 

1964 

1965 Parameters 

1966 ---------- 

1967 directory : `str`, optional 

1968 Directory containing dataset files to import from. If `None`, 

1969 ``filename`` and all dataset file paths specified therein must 

1970 be absolute. 

1971 filename : `str` or `TextIO`, optional 

1972 A stream or name of file that contains database information 

1973 associated with the exported datasets, typically generated by 

1974 `~lsst.daf.butler.Butler.export`. If this a string (name) and 

1975 is not an absolute path, does not exist in the current working 

1976 directory, and ``directory`` is not `None`, it is assumed to be in 

1977 ``directory``. Defaults to "export.{format}". 

1978 format : `str`, optional 

1979 File format for ``filename``. If `None`, the extension of 

1980 ``filename`` will be used. 

1981 transfer : `str`, optional 

1982 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1983 skip_dimensions : `set`, optional 

1984 Names of dimensions that should be skipped and not imported. 

1985 idGenerationMode : `DatasetIdGenEnum`, optional 

1986 Specifies option for generating dataset IDs when IDs are not 

1987 provided or their type does not match backend type. By default 

1988 unique IDs are generated for each inserted dataset. 

1989 reuseIds : `bool`, optional 

1990 If `True` then forces re-use of imported dataset IDs for integer 

1991 IDs which are normally generated as auto-incremented; exception 

1992 will be raised if imported IDs clash with existing ones. This 

1993 option has no effect on the use of globally-unique IDs which are 

1994 always re-used (or generated if integer IDs are being imported). 

1995 

1996 Raises 

1997 ------ 

1998 TypeError 

1999 Raised if the set of arguments passed is inconsistent, or if the 

2000 butler is read-only. 

2001 """ 

2002 if not self.isWriteable(): 

2003 raise TypeError("Butler is read-only.") 

2004 if format is None: 

2005 if filename is None: 

2006 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2007 else: 

2008 _, format = os.path.splitext(filename) # type: ignore 

2009 elif filename is None: 

2010 filename = f"export.{format}" 

2011 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

2012 filename = os.path.join(directory, filename) 

2013 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2014 

2015 def doImport(importStream: TextIO) -> None: 

2016 backend = BackendClass(importStream, self.registry) 

2017 backend.register() 

2018 with self.transaction(): 

2019 backend.load( 

2020 self.datastore, 

2021 directory=directory, 

2022 transfer=transfer, 

2023 skip_dimensions=skip_dimensions, 

2024 idGenerationMode=idGenerationMode, 

2025 reuseIds=reuseIds, 

2026 ) 

2027 

2028 if isinstance(filename, str): 

2029 with open(filename, "r") as stream: 

2030 doImport(stream) 

2031 else: 

2032 doImport(filename) 

2033 
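Example (illustrative sketch; the directory and file names are hypothetical): load a repository export produced by `Butler.export`, copying the referenced files into this repository:

    butler.import_(directory="/path/to/exported", filename="export.yaml", transfer="copy")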

2034 def transfer_from( 

2035 self, 

2036 source_butler: Butler, 

2037 source_refs: Iterable[DatasetRef], 

2038 transfer: str = "auto", 

2039 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

2040 skip_missing: bool = True, 

2041 register_dataset_types: bool = False, 

2042 ) -> List[DatasetRef]: 

2043 """Transfer datasets to this Butler from a run in another Butler. 

2044 

2045 Parameters 

2046 ---------- 

2047 source_butler : `Butler` 

2048 Butler from which the datasets are to be transferred. 

2049 source_refs : iterable of `DatasetRef` 

2050 Datasets defined in the source butler that should be transferred to 

2051 this butler. 

2052 transfer : `str`, optional 

2053 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2054 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

2055 A mapping of dataset type to ID generation mode. Only used if 

2056 the source butler is using integer IDs. Should not be used 

2057 if this receiving butler uses integer IDs. If not given, dataset 

2058 import always uses `DatasetIdGenEnum.UNIQUE`. 

2059 skip_missing : `bool` 

2060 If `True`, datasets with no datastore artifact associated with 

2061 them are not transferred. If `False` a registry entry will be 

2062 created even if no datastore record is created (and so will 

2063 look equivalent to the dataset being unstored). 

2064 register_dataset_types : `bool` 

2065 If `True` any missing dataset types are registered. Otherwise 

2066 an exception is raised. 

2067 

2068 Returns 

2069 ------- 

2070 refs : `list` of `DatasetRef` 

2071 The refs added to this Butler. 

2072 

2073 Notes 

2074 ----- 

2075 Requires that any dimension definitions are already present in the 

2076 receiving Butler. The datastore artifact has to exist for a transfer 

2077 to be made but non-existence is not an error. 

2078 

2079 Datasets that already exist in this run will be skipped. 

2080 

2081 The datasets are imported as part of a transaction, although 

2082 dataset types are registered before the transaction is started. 

2083 This means that it is possible for a dataset type to be registered 

2084 even though transfer has failed. 

2085 """ 

2086 if not self.isWriteable(): 

2087 raise TypeError("Butler is read-only.") 

2088 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2089 

2090 # Will iterate through the refs multiple times so need to convert 

2091 # to a list if this isn't a collection. 

2092 if not isinstance(source_refs, collections.abc.Collection): 

2093 source_refs = list(source_refs) 

2094 

2095 original_count = len(source_refs) 

2096 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2097 

2098 if id_gen_map is None: 

2099 id_gen_map = {} 

2100 

2101 # In some situations the datastore artifact may be missing 

2102 # and we do not want that registry entry to be imported. 

2103 # Asking datastore is not sufficient, the records may have been 

2104 # purged, we have to ask for the (predicted) URI and check 

2105 # existence explicitly. Execution butler is set up exactly like 

2106 # this with no datastore records. 

2107 artifact_existence: Dict[ResourcePath, bool] = {} 

2108 if skip_missing: 

2109 dataset_existence = source_butler.datastore.mexists( 

2110 source_refs, artifact_existence=artifact_existence 

2111 ) 

2112 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2113 filtered_count = len(source_refs) 

2114 log.verbose( 

2115 "%d datasets removed because the artifact does not exist. Now have %d.", 

2116 original_count - filtered_count, 

2117 filtered_count, 

2118 ) 

2119 

2120 # Importing requires that we group the refs by dataset type and run 

2121 # before doing the import. 

2122 source_dataset_types = set() 

2123 grouped_refs = defaultdict(list) 

2124 grouped_indices = defaultdict(list) 

2125 for i, ref in enumerate(source_refs): 

2126 grouped_refs[ref.datasetType, ref.run].append(ref) 

2127 grouped_indices[ref.datasetType, ref.run].append(i) 

2128 source_dataset_types.add(ref.datasetType) 

2129 

2130 # Check to see if the dataset type in the source butler has 

2131 # the same definition in the target butler and register missing 

2132 # ones if requested. Registration must happen outside a transaction. 

2133 newly_registered_dataset_types = set() 

2134 for datasetType in source_dataset_types: 

2135 if register_dataset_types: 

2136 # Let this raise immediately if inconsistent. Continuing 

2137 # on to find additional inconsistent dataset types 

2138 # might result in additional unwanted dataset types being 

2139 # registered. 

2140 if self.registry.registerDatasetType(datasetType): 

2141 newly_registered_dataset_types.add(datasetType) 

2142 else: 

2143 # If the dataset type is missing, let it fail immediately. 

2144 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2145 if target_dataset_type != datasetType: 

2146 raise ConflictingDefinitionError( 

2147 "Source butler dataset type differs from definition" 

2148 f" in target butler: {datasetType} !=" 

2149 f" {target_dataset_type}" 

2150 ) 

2151 if newly_registered_dataset_types: 

2152 # We may have registered some even if there were inconsistencies 

2153 # but should let people know (or else remove them again). 

2154 log.log( 

2155 VERBOSE, 

2156 "Registered the following dataset types in the target Butler: %s", 

2157 ", ".join(d.name for d in newly_registered_dataset_types), 

2158 ) 

2159 else: 

2160 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2161 

2162 # The returned refs should be identical for UUIDs. 

2163 # For now must also support integers and so need to retain the 

2164 # newly-created refs from this registry. 

2165 # Pre-size it so we can assign refs into the correct slots 

2166 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

2167 default_id_gen = DatasetIdGenEnum.UNIQUE 

2168 

2169 handled_collections: Set[str] = set() 

2170 

2171 # Do all the importing in a single transaction. 

2172 with self.transaction(): 

2173 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2174 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2175 ): 

2176 if run not in handled_collections: 

2177 run_doc = source_butler.registry.getCollectionDocumentation(run) 

2178 registered = self.registry.registerRun(run, doc=run_doc) 

2179 handled_collections.add(run) 

2180 if registered: 

2181 log.log(VERBOSE, "Creating output run %s", run) 

2182 

2183 id_generation_mode = default_id_gen 

2184 if isinstance(refs_to_import[0].id, int): 

2185 # ID generation mode might need to be overridden when 

2186 # targeting UUID 

2187 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

2188 

2189 n_refs = len(refs_to_import) 

2190 log.verbose( 

2191 "Importing %d ref%s of dataset type %s into run %s", 

2192 n_refs, 

2193 "" if n_refs == 1 else "s", 

2194 datasetType.name, 

2195 run, 

2196 ) 

2197 

2198 # No way to know if this butler's registry uses UUID. 

2199 # We have to trust the caller on this. If it fails they will 

2200 # have to change their approach. We can't catch the exception 

2201 # and retry with unique because that will mess up the 

2202 # transaction handling. We aren't allowed to ask the registry 

2203 # manager what type of ID it is using. 

2204 imported_refs = self.registry._importDatasets( 

2205 refs_to_import, idGenerationMode=id_generation_mode, expand=False 

2206 ) 

2207 

2208 # Map them into the correct slots to match the initial order 

2209 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

2210 transferred_refs_tmp[i] = ref 

2211 

2212 # Mypy insists that we might have None in here so we have to make 

2213 # that explicit by assigning to a new variable and filtering out 

2214 # something that won't be there. 

2215 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

2216 

2217 # Check consistency 

2218 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

2219 

2220 log.verbose("Imported %d datasets into destination butler", len(transferred_refs)) 

2221 

2222 # The transferred refs need to be reordered to match the original 

2223 # ordering given by the caller. Without this the datastore transfer 

2224 # will be broken. 

2225 

2226 # Ask the datastore to transfer. The datastore has to check that 

2227 # the source datastore is compatible with the target datastore. 

2228 self.datastore.transfer_from( 

2229 source_butler.datastore, 

2230 source_refs, 

2231 local_refs=transferred_refs, 

2232 transfer=transfer, 

2233 artifact_existence=artifact_existence, 

2234 ) 

2235 

2236 return transferred_refs 

2237 
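Example (illustrative sketch; repository paths, dataset type, and collection name are hypothetical): copy the datasets selected by a query from one repository into another, registering any missing dataset types:

    source = Butler("/path/to/source_repo")
    dest = Butler("./repo", writeable=True)
    refs = source.registry.queryDatasets("flat", collections="MyCam/calib")
    transferred = dest.transfer_from(source, refs, transfer="copy", register_dataset_types=True)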

2238 def validateConfiguration( 

2239 self, 

2240 logFailures: bool = False, 

2241 datasetTypeNames: Optional[Iterable[str]] = None, 

2242 ignore: Optional[Iterable[str]] = None, 

2243 ) -> None: 

2244 """Validate butler configuration. 

2245 

2246 Checks that each `DatasetType` can be stored in the `Datastore`. 

2247 

2248 Parameters 

2249 ---------- 

2250 logFailures : `bool`, optional 

2251 If `True`, output a log message for every validation error 

2252 detected. 

2253 datasetTypeNames : iterable of `str`, optional 

2254 The `DatasetType` names that should be checked. This allows 

2255 only a subset to be selected. 

2256 ignore : iterable of `str`, optional 

2257 Names of DatasetTypes to skip over. This can be used to skip 

2258 known problems. If a named `DatasetType` corresponds to a 

2259 composite, all components of that `DatasetType` will also be 

2260 ignored. 

2261 

2262 Raises 

2263 ------ 

2264 ButlerValidationError 

2265 Raised if there is some inconsistency with how this Butler 

2266 is configured. 

2267 """ 

2268 if datasetTypeNames: 

2269 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2270 else: 

2271 datasetTypes = list(self.registry.queryDatasetTypes()) 

2272 

2273 # filter out anything from the ignore list 

2274 if ignore: 

2275 ignore = set(ignore) 

2276 datasetTypes = [ 

2277 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2278 ] 

2279 else: 

2280 ignore = set() 

2281 

2282 # Find all the registered instruments 

2283 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2284 

2285 # For each datasetType that has an instrument dimension, create 

2286 # a DatasetRef for each defined instrument 

2287 datasetRefs = [] 

2288 

2289 for datasetType in datasetTypes: 

2290 if "instrument" in datasetType.dimensions: 

2291 for instrument in instruments: 

2292 datasetRef = DatasetRef( 

2293 datasetType, {"instrument": instrument}, conform=False # type: ignore 

2294 ) 

2295 datasetRefs.append(datasetRef) 

2296 

2297 entities: List[Union[DatasetType, DatasetRef]] = [] 

2298 entities.extend(datasetTypes) 

2299 entities.extend(datasetRefs) 

2300 

2301 datastoreErrorStr = None 

2302 try: 

2303 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2304 except ValidationError as e: 

2305 datastoreErrorStr = str(e) 

2306 

2307 # Also check that the LookupKeys used by the datastores match 

2308 # registry and storage class definitions 

2309 keys = self.datastore.getLookupKeys() 

2310 

2311 failedNames = set() 

2312 failedDataId = set() 

2313 for key in keys: 

2314 if key.name is not None: 

2315 if key.name in ignore: 

2316 continue 

2317 

2318 # skip if specific datasetType names were requested and this 

2319 # name does not match 

2320 if datasetTypeNames and key.name not in datasetTypeNames: 

2321 continue 

2322 

2323 # See if it is a StorageClass or a DatasetType 

2324 if key.name in self.storageClasses: 

2325 pass 

2326 else: 

2327 try: 

2328 self.registry.getDatasetType(key.name) 

2329 except KeyError: 

2330 if logFailures: 

2331 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2332 failedNames.add(key) 

2333 else: 

2334 # Dimensions are checked for consistency when the Butler 

2335 # is created and rendezvoused with a universe. 

2336 pass 

2337 

2338 # Check that the instrument is a valid instrument 

2339 # Currently only support instrument so check for that 

2340 if key.dataId: 

2341 dataIdKeys = set(key.dataId) 

2342 if set(["instrument"]) != dataIdKeys: 

2343 if logFailures: 

2344 log.critical("Key '%s' has unsupported DataId override", key) 

2345 failedDataId.add(key) 

2346 elif key.dataId["instrument"] not in instruments: 

2347 if logFailures: 

2348 log.critical("Key '%s' has unknown instrument", key) 

2349 failedDataId.add(key) 

2350 

2351 messages = [] 

2352 

2353 if datastoreErrorStr: 

2354 messages.append(datastoreErrorStr) 

2355 

2356 for failed, msg in ( 

2357 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2358 (failedDataId, "Keys with bad DataId entries: "), 

2359 ): 

2360 if failed: 

2361 msg += ", ".join(str(k) for k in failed) 

2362 messages.append(msg) 

2363 

2364 if messages: 

2365 raise ValidationError(";\n".join(messages)) 

2366 
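Example (illustrative sketch; the ignored dataset type name is hypothetical): validate the configuration, logging each problem, and report the combined failure message:

    from lsst.daf.butler import ValidationError

    try:
        butler.validateConfiguration(logFailures=True, ignore=["raw"])
    except ValidationError as err:
        print(f"Butler configuration problems:\n{err}")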

2367 @property 

2368 def collections(self) -> CollectionSearch: 

2369 """The collections to search by default, in order (`CollectionSearch`). 

2370 

2371 This is an alias for ``self.registry.defaults.collections``. It cannot 

2372 be set directly in isolation, but all defaults may be changed together 

2373 by assigning a new `RegistryDefaults` instance to 

2374 ``self.registry.defaults``. 

2375 """ 

2376 return self.registry.defaults.collections 

2377 

2378 @property 

2379 def run(self) -> Optional[str]: 

2380 """Name of the run this butler writes outputs to by default (`str` or 

2381 `None`). 

2382 

2383 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2384 directly in isolation, but all defaults may be changed together by 

2385 assigning a new `RegistryDefaults` instance to 

2386 ``self.registry.defaults``. 

2387 """ 

2388 return self.registry.defaults.run 

2389 
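Example (illustrative sketch; collection and run names are hypothetical): the defaults can only be replaced as a group by assigning a new `RegistryDefaults` instance:

    from lsst.daf.butler.registry import RegistryDefaults

    print(butler.collections, butler.run)
    butler.registry.defaults = RegistryDefaults(collections="MyCam/defaults", run="u/someone/run")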

2390 registry: Registry 

2391 """The object that manages dataset metadata and relationships (`Registry`). 

2392 

2393 Most operations that don't involve reading or writing butler datasets are 

2394 accessible only via `Registry` methods. 

2395 """ 

2396 

2397 datastore: Datastore 

2398 """The object that manages actual dataset storage (`Datastore`). 

2399 

2400 Direct user access to the datastore should rarely be necessary; the primary 

2401 exception is the case where a `Datastore` implementation provides extra 

2402 functionality beyond what the base class defines. 

2403 """ 

2404 

2405 storageClasses: StorageClassFactory 

2406 """An object that maps known storage class names to objects that fully 

2407 describe them (`StorageClassFactory`). 

2408 """ 

2409 

2410 _allow_put_of_predefined_dataset: bool 

2411 """Allow a put to succeed even if there is already a registry entry for it 

2412 but not a datastore record. (`bool`)."""