
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30 "PruneCollectionsArgsError", 

31 "PurgeWithoutUnstorePruneCollectionsError", 

32 "RunWithoutPurgePruneCollectionsError", 

33 "PurgeUnsupportedPruneCollectionsError", 

34) 

35 

36import collections.abc 

37from collections import defaultdict 

38import contextlib 

39import logging 

40import numbers 

41import os 

42from typing import ( 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59try: 

60 import boto3 

61except ImportError: 

62 boto3 = None 

63 

64from lsst.utils import doImport 

65from .core import ( 

66 AmbiguousDatasetError, 

67 ButlerURI, 

68 Config, 

69 ConfigSubset, 

70 DataCoordinate, 

71 DataId, 

72 DataIdValue, 

73 DatasetRef, 

74 DatasetType, 

75 Datastore, 

76 Dimension, 

77 DimensionConfig, 

78 FileDataset, 

79 Progress, 

80 StorageClassFactory, 

81 Timespan, 

82 ValidationError, 

83 VERBOSE, 

84) 

85from .core.repoRelocation import BUTLER_ROOT_TAG 

86from .core.utils import transactional, getClassOf 

87from ._deferredDatasetHandle import DeferredDatasetHandle 

88from ._butlerConfig import ButlerConfig 

89from .registry import ( 

90 Registry, 

91 RegistryConfig, 

92 RegistryDefaults, 

93 CollectionSearch, 

94 CollectionType, 

95 ConflictingDefinitionError, 

96 DatasetIdGenEnum, 

97) 

98from .transfers import RepoExportContext 

99 

100log = logging.getLogger(__name__) 

101 

102 

103class ButlerValidationError(ValidationError): 

104 """There is a problem with the Butler configuration.""" 

105 pass 

106 

107 

108class PruneCollectionsArgsError(TypeError): 

109 """Base class for errors relating to Butler.pruneCollections input 

110 arguments. 

111 """ 

112 pass 

113 

114 

115class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError): 

116 """Raised when purge and unstore are both required to be True, and 

117 purge is True but unstore is False. 

118 """ 

119 

120 def __init__(self) -> None: 

121 super().__init__("Cannot pass purge=True without unstore=True.") 

122 

123 

124class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError): 

125 """Raised when pruning a RUN collection but purge is False.""" 

126 

127 def __init__(self, collectionType: CollectionType): 

128 self.collectionType = collectionType 

129 super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.") 

130 

131 

132class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError): 

133 """Raised when purge is True but is not supported for the given 

134 collection.""" 

135 

136 def __init__(self, collectionType: CollectionType): 

137 self.collectionType = collectionType 

138 super().__init__( 

139 f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.") 

140 

141 

142class Butler: 

143 """Main entry point for the data access system. 

144 

145 Parameters 

146 ---------- 

147 config : `ButlerConfig`, `Config` or `str`, optional. 

148 Configuration. Anything acceptable to the 

149 `ButlerConfig` constructor. If a directory path 

150 is given the configuration will be read from a ``butler.yaml`` file in 

151 that location. If `None` is given default values will be used. 

152 butler : `Butler`, optional. 

153 If provided, construct a new Butler that uses the same registry and 

154 datastore as the given one, but with the given collection and run. 

155 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

156 arguments. 

157 collections : `str` or `Iterable` [ `str` ], optional 

158 An expression specifying the collections to be searched (in order) when 

159 reading datasets. 

160 This may be a `str` collection name or an iterable thereof. 

161 See :ref:`daf_butler_collection_expressions` for more information. 

162 These collections are not registered automatically and must be 

163 manually registered before they are used by any method, but they may be 

164 manually registered after the `Butler` is initialized. 

165 run : `str`, optional 

166 Name of the `~CollectionType.RUN` collection new datasets should be 

167 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

168 ``collections`` will be set to ``[run]``. If not `None`, this 

169 collection will automatically be registered. If this is not set (and 

170 ``writeable`` is not set either), a read-only butler will be created. 

171 searchPaths : `list` of `str`, optional 

172 Directory paths to search when calculating the full Butler 

173 configuration. Not used if the supplied config is already a 

174 `ButlerConfig`. 

175 writeable : `bool`, optional 

176 Explicitly sets whether the butler supports write operations. If not 

177 provided, a read-write butler is created if any of ``run``, ``tags``, 

178 or ``chains`` is non-empty. 

179 inferDefaults : `bool`, optional 

180 If `True` (default) infer default data ID values from the values 

181 present in the datasets in ``collections``: if all collections have the 

182 same value (or no value) for a governor dimension, that value will be 

183 the default for that dimension. Nonexistent collections are ignored. 

184 If a default value is provided explicitly for a governor dimension via 

185 ``**kwargs``, no default will be inferred for that dimension. 

186 **kwargs : `str` 

187 Default data ID key-value pairs. These may only identify "governor" 

188 dimensions like ``instrument`` and ``skymap``. 

189 

190 Examples 

191 -------- 

192 While there are many ways to control exactly how a `Butler` interacts with 

193 the collections in its `Registry`, the most common cases are still simple. 

194 

195 For a read-only `Butler` that searches one collection, do:: 

196 

197 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

198 

199 For a read-write `Butler` that writes to and reads from a 

200 `~CollectionType.RUN` collection:: 

201 

202 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

203 

204 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

205 because we want to write to one `~CollectionType.RUN` collection but read 

206 from several others (as well):: 

207 

208 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

209 collections=["u/alice/DM-50000/a", 

210 "u/bob/DM-49998", 

211 "HSC/defaults"]) 

212 

213 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

214 Datasets will be read first from that run (since it appears first in the 

215 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

216 

217 Finally, one can always create a `Butler` with no collections:: 

218 

219 butler = Butler("/path/to/repo", writeable=True) 

220 

221 This can be extremely useful when you just want to use ``butler.registry``, 

222 e.g. for inserting dimension data or managing collections, or when the 

223 collections you want to use with the butler are not consistent. 

224 Passing ``writeable`` explicitly here is only necessary if you want to be 

225 able to make changes to the repo - usually the value for ``writeable`` can 

226 be guessed from the collection arguments provided, but it defaults to 

227 `False` when there are no collection arguments.

228 """ 

229 def __init__(self, config: Union[Config, str, None] = None, *, 

230 butler: Optional[Butler] = None, 

231 collections: Any = None, 

232 run: Optional[str] = None, 

233 searchPaths: Optional[List[str]] = None, 

234 writeable: Optional[bool] = None, 

235 inferDefaults: bool = True, 

236 **kwargs: str, 

237 ): 

238 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

239 # Load registry, datastore, etc. from config or existing butler. 

240 if butler is not None: 

241 if config is not None or searchPaths is not None or writeable is not None: 

242 raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' " 

243 "arguments with 'butler' argument.") 

244 self.registry = butler.registry.copy(defaults) 

245 self.datastore = butler.datastore 

246 self.storageClasses = butler.storageClasses 

247 self._config: ButlerConfig = butler._config 

248 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

249 else: 

250 self._config = ButlerConfig(config, searchPaths=searchPaths) 

251 try: 

252 if "root" in self._config: 

253 butlerRoot = self._config["root"] 

254 else: 

255 butlerRoot = self._config.configDir 

256 if writeable is None: 

257 writeable = run is not None 

258 self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable, 

259 defaults=defaults) 

260 self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(), 

261 butlerRoot=butlerRoot) 

262 self.storageClasses = StorageClassFactory() 

263 self.storageClasses.addFromConfig(self._config) 

264 self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset", 

265 False) 

266 except Exception: 

267 # Failures here usually mean that configuration is incomplete, 

268 # just issue an error message which includes config file URI. 

269 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

270 raise 

271 

272 if "run" in self._config or "collection" in self._config: 

273 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

274 

275 GENERATION: ClassVar[int] = 3 

276 """This is a Generation 3 Butler. 

277 

278 This attribute may be removed in the future, once the Generation 2 Butler 

279 interface has been fully retired; it should only be used in transitional 

280 code. 

281 """ 

282 

283 @staticmethod 

284 def makeRepo(root: str, config: Union[Config, str, None] = None, 

285 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False, 

286 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True, 

287 outfile: Optional[str] = None, overwrite: bool = False) -> Config: 

288 """Create an empty data repository by adding a butler.yaml config 

289 to a repository root directory. 

290 

291 Parameters 

292 ---------- 

293 root : `str` or `ButlerURI` 

294 Path or URI to the root location of the new repository. Will be 

295 created if it does not exist. 

296 config : `Config` or `str`, optional 

297 Configuration to write to the repository, after setting any 

298 root-dependent Registry or Datastore config options. Can not 

299 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

300 configuration will be used. Root-dependent config options 

301 specified in this config are overwritten if ``forceConfigRoot`` 

302 is `True`. 

303 dimensionConfig : `Config` or `str`, optional 

304 Configuration for dimensions, will be used to initialize registry 

305 database. 

306 standalone : `bool` 

307 If True, write all expanded defaults, not just customized or 

308 repository-specific settings. 

309 This (mostly) decouples the repository from the default 

310 configuration, insulating it from changes to the defaults (which 

311 may be good or bad, depending on the nature of the changes). 

312 Future *additions* to the defaults will still be picked up when 

313 initializing `Butlers` to repos created with ``standalone=True``. 

314 searchPaths : `list` of `str`, optional 

315 Directory paths to search when calculating the full butler 

316 configuration. 

317 forceConfigRoot : `bool`, optional 

318 If `False`, any values present in the supplied ``config`` that 

319 would normally be reset are not overridden and will appear 

320 directly in the output config. This allows non-standard overrides 

321 of the root directory for a datastore or registry to be given. 

322 If this parameter is `True` the values for ``root`` will be 

323 forced into the resulting config if appropriate. 

324 outfile : `str`, optional 

325 If not-`None`, the output configuration will be written to this 

326 location rather than into the repository itself. Can be a URI 

327 string. Can refer to a directory that will be used to write 

328 ``butler.yaml``. 

329 overwrite : `bool`, optional 

330 Create a new configuration file even if one already exists 

331 in the specified output location. Default is to raise 

332 an exception. 

333 

334 Returns 

335 ------- 

336 config : `Config` 

337 The updated `Config` instance written to the repo. 

338 

339 Raises 

340 ------ 

341 ValueError 

342 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

343 regular Config (as these subclasses would make it impossible to 

344 support ``standalone=False``). 

345 FileExistsError 

346 Raised if the output config file already exists. 

347 os.error 

348 Raised if the directory does not exist, exists but is not a 

349 directory, or cannot be created. 

350 

351 Notes 

352 ----- 

353 Note that when ``standalone=False`` (the default), the configuration 

354 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

355 construct the repository should also be used to construct any Butlers 

356 to avoid configuration inconsistencies. 

357 """ 

358 if isinstance(config, (ButlerConfig, ConfigSubset)): 

359 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

360 

361 # Ensure that the root of the repository exists or can be made 

362 uri = ButlerURI(root, forceDirectory=True) 

363 uri.mkdir() 

364 

365 config = Config(config) 

366 

367 # If we are creating a new repo from scratch with relative roots, 

368 # do not propagate an explicit root from the config file 

369 if "root" in config: 

370 del config["root"] 

371 

372 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

373 datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"]) 

374 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

375 

376 # if key exists in given config, parse it, otherwise parse the defaults 

377 # in the expanded config 

378 if config.get(("registry", "db")): 

379 registryConfig = RegistryConfig(config) 

380 else: 

381 registryConfig = RegistryConfig(full) 

382 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

383 if defaultDatabaseUri is not None: 

384 Config.updateParameters(RegistryConfig, config, full, 

385 toUpdate={"db": defaultDatabaseUri}, 

386 overwrite=forceConfigRoot) 

387 else: 

388 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), 

389 overwrite=forceConfigRoot) 

390 

391 if standalone: 

392 config.merge(full) 

393 else: 

394 # Always expand the registry.managers section into the per-repo 

395 # config, because after the database schema is created, it's not 

396 # allowed to change anymore. Note that in the standalone=True 

397 # branch, _everything_ in the config is expanded, so there's no 

398 # need to special case this. 

399 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

400 configURI: Union[str, ButlerURI] 

401 if outfile is not None: 

402 # When writing to a separate location we must include 

403 # the root of the butler repo in the config else it won't know 

404 # where to look. 

405 config["root"] = uri.geturl() 

406 configURI = outfile 

407 else: 

408 configURI = uri 

409 config.dumpToUri(configURI, overwrite=overwrite) 

410 

411 # Create Registry and populate tables 

412 registryConfig = RegistryConfig(config.get("registry")) 

413 dimensionConfig = DimensionConfig(dimensionConfig) 

414 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root) 

415 

416 log.log(VERBOSE, "Wrote new Butler configuration file to %s", configURI) 

417 

418 return config 

419 
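# Illustrative sketch, not part of the original module: creating an empty
# repository with default configuration and then opening it. The path below
# is hypothetical.
#
#     Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", writeable=True)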

420 @classmethod 

421 def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str], 

422 defaultDataId: Dict[str, str], writeable: bool) -> Butler: 

423 """Callable used to unpickle a Butler. 

424 

425 We prefer not to use ``Butler.__init__`` directly so we can force some 

426 of its many arguments to be keyword-only (note that ``__reduce__`` 

427 can only invoke callables with positional arguments). 

428 

429 Parameters 

430 ---------- 

431 config : `ButlerConfig` 

432 Butler configuration, already coerced into a true `ButlerConfig` 

433 instance (and hence after any search paths for overrides have been 

434 utilized). 

435 collections : `CollectionSearch` 

436 Names of the default collections to read from. 

437 run : `str`, optional 

438 Name of the default `~CollectionType.RUN` collection to write to. 

439 defaultDataId : `dict` [ `str`, `str` ] 

440 Default data ID values. 

441 writeable : `bool` 

442 Whether the Butler should support write operations. 

443 

444 Returns 

445 ------- 

446 butler : `Butler` 

447 A new `Butler` instance. 

448 """ 

449 # MyPy doesn't recognize that the kwargs below are totally valid; it 

450 # seems to think '**defaultDataId' is a _positional_ argument!

451 return cls(config=config, collections=collections, run=run, writeable=writeable, 

452 **defaultDataId) # type: ignore 

453 

454 def __reduce__(self) -> tuple: 

455 """Support pickling. 

456 """ 

457 return (Butler._unpickle, (self._config, self.collections, self.run, 

458 self.registry.defaults.dataId.byName(), 

459 self.registry.isWriteable())) 

460 

461 def __str__(self) -> str: 

462 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

463 self.collections, self.run, self.datastore, self.registry) 

464 

465 def isWriteable(self) -> bool: 

466 """Return `True` if this `Butler` supports write operations. 

467 """ 

468 return self.registry.isWriteable() 

469 

470 @contextlib.contextmanager 

471 def transaction(self) -> Iterator[None]: 

472 """Context manager supporting `Butler` transactions. 

473 

474 Transactions can be nested. 

475 """ 

476 with self.registry.transaction(): 

477 with self.datastore.transaction(): 

478 yield 

479 
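# Illustrative sketch, not part of the original module: grouping writes so
# that a failure rolls back both registry and datastore changes. ``obj1``,
# ``obj2``, the dataset type names, and the data IDs are hypothetical.
#
#     with butler.transaction():
#         butler.put(obj1, "typeA", dataId1)
#         butler.put(obj2, "typeB", dataId2)  # an exception here undoes both puts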

480 def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

481 dataId: Optional[DataId] = None, **kwargs: Any 

482 ) -> Tuple[DatasetType, Optional[DataId]]: 

483 """Standardize the arguments passed to several Butler APIs. 

484 

485 Parameters 

486 ---------- 

487 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

488 When `DatasetRef` the `dataId` should be `None`. 

489 Otherwise the `DatasetType` or name thereof. 

490 dataId : `dict` or `DataCoordinate` 

491 A `dict` of `Dimension` link name, value pairs that label the 

492 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

493 should be provided as the second argument. 

494 **kwargs 

495 Additional keyword arguments used to augment or construct a 

496 `DataCoordinate`. See `DataCoordinate.standardize` 

497 parameters. 

498 

499 Returns 

500 ------- 

501 datasetType : `DatasetType` 

502 A `DatasetType` instance extracted from ``datasetRefOrType``. 

503 dataId : `dict` or `DataId`, optional 

504 Argument that can be used (along with ``kwargs``) to construct a 

505 `DataId`. 

506 

507 Notes 

508 ----- 

509 Butler APIs that conceptually need a DatasetRef also allow passing a 

510 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

511 keyword arguments that can be used to construct one) separately. This 

512 method accepts those arguments and always returns a true `DatasetType` 

513 and a `DataId` or `dict`. 

514 

515 Standardization of `dict` vs `DataId` is best handled by passing the 

516 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

517 generally similarly flexible. 

518 """ 

519 externalDatasetType: Optional[DatasetType] = None 

520 internalDatasetType: Optional[DatasetType] = None 

521 if isinstance(datasetRefOrType, DatasetRef): 

522 if dataId is not None or kwargs: 

523 raise ValueError("DatasetRef given, cannot use dataId as well") 

524 externalDatasetType = datasetRefOrType.datasetType 

525 dataId = datasetRefOrType.dataId 

526 else: 

527 # Don't check whether DataId is provided, because Registry APIs 

528 # can usually construct a better error message when it wasn't. 

529 if isinstance(datasetRefOrType, DatasetType): 

530 externalDatasetType = datasetRefOrType 

531 else: 

532 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

533 

534 # Check that they are self-consistent 

535 if externalDatasetType is not None: 

536 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

537 if externalDatasetType != internalDatasetType: 

538 raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

539 f"registry definition ({internalDatasetType})") 

540 

541 assert internalDatasetType is not None 

542 return internalDatasetType, dataId 

543 

544 def _rewrite_data_id(self, dataId: Optional[DataId], datasetType: DatasetType, 

545 **kwargs: Any) -> Tuple[Optional[DataId], Dict[str, Any]]: 

546 """Rewrite a data ID taking into account dimension records. 

547 

548 Take a Data ID and keyword args and rewrite it if necessary to 

549 allow the user to specify dimension records rather than dimension 

550 primary values. 

551 

552 This allows a user to include a dataId dict with keys of 

553 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

554 the integer exposure ID. It also allows a string to be given 

555 for a dimension value rather than the integer ID if that is more 

556 convenient. For example, rather than having to specify the

557 detector with ``detector.full_name``, a string given for ``detector`` 

558 will be interpreted as the full name and converted to the integer 

559 value. 

560 

561 Keyword arguments can also use strings for dimensions like detector 

562 and exposure but python does not allow them to include ``.`` and 

563 so the ``exposure.day_obs`` syntax can not be used in a keyword 

564 argument. 

565 

566 Parameters 

567 ---------- 

568 dataId : `dict` or `DataCoordinate` 

569 A `dict` of `Dimension` link name, value pairs that will label the 

570 `DatasetRef` within a Collection. 

571 datasetType : `DatasetType` 

572 The dataset type associated with this dataId. Required to 

573 determine the relevant dimensions. 

574 **kwargs 

575 Additional keyword arguments used to augment or construct a 

576 `DataId`. See `DataId` parameters. 

577 

578 Returns 

579 ------- 

580 dataId : `dict` or `DataCoordinate` 

581 The possibly rewritten dataId. If given a `DataCoordinate` and

582 no keyword arguments, the original dataId will be returned

583 unchanged. 

584 **kwargs : `dict` 

585 Any unused keyword arguments. 

586 """ 

587 # Do nothing if we have a standalone DataCoordinate. 

588 if isinstance(dataId, DataCoordinate) and not kwargs: 

589 return dataId, kwargs 

590 

591 # Process dimension records that are using record information 

592 # rather than ids 

593 newDataId: Dict[str, DataIdValue] = {} 

594 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

595 

596 # If all of the dataId comes from keyword parameters we do not need

597 # to do anything here, because keys of the form exposure.obs_id are

598 # impossible: a "." is not allowed in a keyword parameter.

599 if dataId: 

600 for k, v in dataId.items(): 

601 # If we have a Dimension we do not need to do anything 

602 # because it cannot be a compound key. 

603 if isinstance(k, str) and "." in k: 

604 # Someone is using a more human-readable dataId 

605 dimensionName, record = k.split(".", 1) 

606 byRecord[dimensionName][record] = v 

607 elif isinstance(k, Dimension): 

608 newDataId[k.name] = v 

609 else: 

610 newDataId[k] = v 

611 

612 # Go through the updated dataId and check the type in case someone is 

613 # using an alternate key. We have already filtered out the compound 

614 # dimension.record keys.

615 not_dimensions = {} 

616 

617 # Will need to look in the dataId and the keyword arguments 

618 # and will remove them if they need to be fixed or are unrecognized. 

619 for dataIdDict in (newDataId, kwargs): 

620 # Use a list so we can adjust the dict safely in the loop 

621 for dimensionName in list(dataIdDict): 

622 value = dataIdDict[dimensionName] 

623 try: 

624 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

625 except KeyError: 

626 # This is not a real dimension 

627 not_dimensions[dimensionName] = value 

628 del dataIdDict[dimensionName] 

629 continue 

630 

631 # Convert an integral type to an explicit int to simplify 

632 # comparisons here 

633 if isinstance(value, numbers.Integral): 

634 value = int(value) 

635 

636 if not isinstance(value, dimension.primaryKey.getPythonType()): 

637 for alternate in dimension.alternateKeys: 

638 if isinstance(value, alternate.getPythonType()): 

639 byRecord[dimensionName][alternate.name] = value 

640 del dataIdDict[dimensionName] 

641 log.debug("Converting dimension %s to %s.%s=%s", 

642 dimensionName, dimensionName, alternate.name, value) 

643 break 

644 else: 

645 log.warning("Type mismatch found for value '%r' provided for dimension %s. " 

646 "Could not find matching alternative (primary key has type %s) " 

647 "so attempting to use as-is.", 

648 value, dimensionName, dimension.primaryKey.getPythonType()) 

649 

650 # If we have some unrecognized dimensions we have to try to connect 

651 # them to records in other dimensions. This is made more complicated 

652 # by some dimensions having records with clashing names. A mitigation 

653 # is that we can tell by this point which dimensions are missing 

654 # for the DatasetType but this does not work for calibrations 

655 # where additional dimensions can be used to constrain the temporal 

656 # axis. 

657 if not_dimensions: 

658 # Calculate missing dimensions 

659 provided = set(newDataId) | set(kwargs) | set(byRecord) 

660 missingDimensions = datasetType.dimensions.names - provided 

661 

662 # For calibrations we may well be needing temporal dimensions 

663 # so rather than always including all dimensions in the scan 

664 # restrict things a little. It is still possible for there 

665 # to be confusion over day_obs in visit vs exposure for example. 

666 # If we are not searching calibration collections things may 

667 # fail but they are going to fail anyway because of the 

668 # ambiguity of the dataId...

669 candidateDimensions: Set[str] = set() 

670 candidateDimensions.update(missingDimensions) 

671 if datasetType.isCalibration(): 

672 for dim in self.registry.dimensions.getStaticDimensions(): 

673 if dim.temporal: 

674 candidateDimensions.add(str(dim)) 

675 

676 # Look up table for the first association with a dimension 

677 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

678 

679 # Keep track of whether an item is associated with multiple 

680 # dimensions. 

681 counter: Counter[str] = Counter() 

682 assigned: Dict[str, Set[str]] = defaultdict(set) 

683 

684 # Go through the missing dimensions and associate the 

685 # given names with records within those dimensions 

686 for dimensionName in candidateDimensions: 

687 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

688 fields = dimension.metadata.names | dimension.uniqueKeys.names 

689 for field in not_dimensions: 

690 if field in fields: 

691 guessedAssociation[dimensionName][field] = not_dimensions[field] 

692 counter[dimensionName] += 1 

693 assigned[field].add(dimensionName) 

694 

695 # There is a chance we have allocated a single dataId item 

696 # to multiple dimensions. Need to decide which should be retained. 

697 # For now assume that the most popular alternative wins. 

698 # This means that day_obs with seq_num will result in 

699 # exposure.day_obs and not visit.day_obs 

700 # Also prefer an explicitly missing dimension over an inferred 

701 # temporal dimension. 

702 for fieldName, assignedDimensions in assigned.items(): 

703 if len(assignedDimensions) > 1: 

704 # Pick the most popular (preferring mandatory dimensions) 

705 requiredButMissing = assignedDimensions.intersection(missingDimensions) 

706 if requiredButMissing: 

707 candidateDimensions = requiredButMissing 

708 else: 

709 candidateDimensions = assignedDimensions 

710 

711 # Select the relevant items and get a new restricted 

712 # counter. 

713 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

714 duplicatesCounter: Counter[str] = Counter() 

715 duplicatesCounter.update(theseCounts) 

716 

717 # Choose the most common. If they are equally common 

718 # we will pick the one that was found first. 

719 # Returns a list of tuples 

720 selected = duplicatesCounter.most_common(1)[0][0] 

721 

722 log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

723 " Removed ambiguity by choosing dimension %s.", 

724 fieldName, ", ".join(assignedDimensions), selected) 

725 

726 for candidateDimension in assignedDimensions: 

727 if candidateDimension != selected: 

728 del guessedAssociation[candidateDimension][fieldName] 

729 

730 # Update the record look up dict with the new associations 

731 for dimensionName, values in guessedAssociation.items(): 

732 if values: # A dict might now be empty 

733 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", 

734 dimensionName, values) 

735 byRecord[dimensionName].update(values) 

736 

737 if byRecord: 

738 # Some record specifiers were found so we need to convert 

739 # them to the Id form 

740 for dimensionName, values in byRecord.items(): 

741 if dimensionName in newDataId: 

742 log.warning("DataId specified explicit %s dimension value of %s in addition to" 

743 " general record specifiers for it of %s. Ignoring record information.", 

744 dimensionName, newDataId[dimensionName], str(values)) 

745 continue 

746 

747 # Build up a WHERE expression 

748 bind = {k: v for k, v in values.items()} 

749 where = " AND ".join(f"{dimensionName}.{k} = {k}" 

750 for k in bind) 

751 

752 # Hopefully we get a single record that matches 

753 records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId, 

754 where=where, bind=bind, **kwargs)) 

755 

756 if len(records) != 1: 

757 if len(records) > 1: 

758 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

759 for r in records: 

760 log.debug("- %s", str(r)) 

761 raise RuntimeError(f"DataId specification for dimension {dimensionName} is not" 

762 f" uniquely constrained to a single dataset by {values}." 

763 f" Got {len(records)} results.") 

764 raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no" 

765 f" records when constrained by {values}") 

766 

767 # Get the primary key from the real dimension object 

768 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

769 if not isinstance(dimension, Dimension): 

770 raise RuntimeError( 

771 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

772 ) 

773 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

774 

775 # We have modified the dataId so need to switch to it 

776 dataId = newDataId 

777 

778 return dataId, kwargs 

779 
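# Illustrative sketch, not part of the original module: the record-based data
# ID forms described above, as they might be passed to ``Butler.get``. The
# dataset type, instrument, exposure, and detector values are hypothetical.
#
#     butler.get("raw", {"exposure.obs_id": "EXP_0001"}, instrument="MyCam")
#     butler.get("raw", exposure=1234, detector="R22_S11", instrument="MyCam")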

780 def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

781 dataId: Optional[DataId] = None, *, 

782 collections: Any = None, 

783 allowUnresolved: bool = False, 

784 **kwargs: Any) -> DatasetRef: 

785 """Shared logic for methods that start with a search for a dataset in 

786 the registry. 

787 

788 Parameters 

789 ---------- 

790 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

791 When `DatasetRef` the `dataId` should be `None`. 

792 Otherwise the `DatasetType` or name thereof. 

793 dataId : `dict` or `DataCoordinate`, optional 

794 A `dict` of `Dimension` link name, value pairs that label the 

795 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

796 should be provided as the first argument. 

797 collections : Any, optional 

798 Collections to be searched, overriding ``self.collections``. 

799 Can be any of the types supported by the ``collections`` argument 

800 to butler construction. 

801 allowUnresolved : `bool`, optional 

802 If `True`, return an unresolved `DatasetRef` if finding a resolved 

803 one in the `Registry` fails. Defaults to `False`. 

804 **kwargs 

805 Additional keyword arguments used to augment or construct a 

806 `DataId`. See `DataId` parameters. 

807 

808 Returns 

809 ------- 

810 ref : `DatasetRef` 

811 A reference to the dataset identified by the given arguments. 

812 

813 Raises 

814 ------ 

815 LookupError 

816 Raised if no matching dataset exists in the `Registry` (and 

817 ``allowUnresolved is False``). 

818 ValueError 

819 Raised if a resolved `DatasetRef` was passed as an input, but it 

820 differs from the one found in the registry. 

821 TypeError 

822 Raised if no collections were provided. 

823 """ 

824 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

825 if isinstance(datasetRefOrType, DatasetRef): 

826 idNumber = datasetRefOrType.id 

827 else: 

828 idNumber = None 

829 timespan: Optional[Timespan] = None 

830 

831 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

832 

833 if datasetType.isCalibration(): 

834 # Because this is a calibration dataset, first try to

835 # standardize the data ID without restricting the dimensions to 

836 # those of the dataset type requested, because there may be extra 

837 # dimensions that provide temporal information for a validity-range 

838 # lookup. 

839 dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions, 

840 defaults=self.registry.defaults.dataId, **kwargs) 

841 if dataId.graph.temporal: 

842 dataId = self.registry.expandDataId(dataId) 

843 timespan = dataId.timespan 

844 else: 

845 # Standardize the data ID to just the dimensions of the dataset 

846 # type instead of letting registry.findDataset do it, so we get the 

847 # result even if no dataset is found. 

848 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

849 defaults=self.registry.defaults.dataId, **kwargs) 

850 # Always lookup the DatasetRef, even if one is given, to ensure it is 

851 # present in the current collection. 

852 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

853 if ref is None: 

854 if allowUnresolved: 

855 return DatasetRef(datasetType, dataId) 

856 else: 

857 if collections is None: 

858 collections = self.registry.defaults.collections 

859 raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} " 

860 f"could not be found in collections {collections}.") 

861 if idNumber is not None and idNumber != ref.id: 

862 if collections is None: 

863 collections = self.registry.defaults.collections 

864 raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match " 

865 f"id ({ref.id}) in registry in collections {collections}.") 

866 return ref 

867 

868 @transactional 

869 def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

870 dataId: Optional[DataId] = None, *, 

871 run: Optional[str] = None, 

872 **kwargs: Any) -> DatasetRef: 

873 """Store and register a dataset. 

874 

875 Parameters 

876 ---------- 

877 obj : `object` 

878 The dataset. 

879 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

880 When `DatasetRef` is provided, ``dataId`` should be `None`. 

881 Otherwise the `DatasetType` or name thereof. 

882 dataId : `dict` or `DataCoordinate` 

883 A `dict` of `Dimension` link name, value pairs that label the 

884 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

885 should be provided as the second argument. 

886 run : `str`, optional 

887 The name of the run the dataset should be added to, overriding 

888 ``self.run``. 

889 **kwargs 

890 Additional keyword arguments used to augment or construct a 

891 `DataCoordinate`. See `DataCoordinate.standardize` 

892 parameters. 

893 

894 Returns 

895 ------- 

896 ref : `DatasetRef` 

897 A reference to the stored dataset, updated with the correct id if 

898 given. 

899 

900 Raises 

901 ------ 

902 TypeError 

903 Raised if the butler is read-only or if no run has been provided. 

904 """ 

905 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

906 if not self.isWriteable(): 

907 raise TypeError("Butler is read-only.") 

908 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

909 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

910 raise ValueError("DatasetRef must not be in registry, must have None id") 

911 

912 # Handle dimension records in dataId 

913 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

914 

915 # Add Registry Dataset entry. 

916 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

917 

918 # For an execution butler the datasets will be pre-defined. 

919 # If the butler is configured that way datasets should only be inserted 

920 # if they do not already exist in registry. Trying and catching 

921 # ConflictingDefinitionError will not work because the transaction 

922 # will be corrupted. Instead, in this mode always check first. 

923 ref = None 

924 ref_is_predefined = False 

925 if self._allow_put_of_predefined_dataset: 

926 # Get the matching ref for this run. 

927 ref = self.registry.findDataset(datasetType, collections=run, 

928 dataId=dataId) 

929 

930 if ref: 

931 # Must be expanded form for datastore templating 

932 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

933 ref = ref.expanded(dataId) 

934 ref_is_predefined = True 

935 

936 if not ref: 

937 ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

938 

939 # If the ref is predefined it is possible that the datastore also 

940 # has the record. Asking datastore to put it again will result in 

941 # the artifact being recreated, overwriting previous, then will cause 

942 # a failure in writing the record which will cause the artifact 

943 # to be removed. Much safer to ask first before attempting to 

944 # overwrite. Race conditions should not be an issue for the 

945 # execution butler environment. 

946 if ref_is_predefined: 

947 if self.datastore.knows(ref): 

948 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

949 

950 self.datastore.put(obj, ref) 

951 

952 return ref 

953 
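# Illustrative sketch, not part of the original module: storing an in-memory
# object in the default run. The run, dataset type name, and data ID values
# are hypothetical.
#
#     butler = Butler("/path/to/repo", run="u/alice/example-run")
#     ref = butler.put(catalog, "sourceCatalog", instrument="MyCam", visit=42)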

954 def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any: 

955 """Retrieve a stored dataset. 

956 

957 Unlike `Butler.get`, this method allows datasets outside the Butler's 

958 collection to be read as long as the `DatasetRef` that identifies them 

959 can be obtained separately. 

960 

961 Parameters 

962 ---------- 

963 ref : `DatasetRef` 

964 Resolved reference to an already stored dataset. 

965 parameters : `dict` 

966 Additional StorageClass-defined options to control reading, 

967 typically used to efficiently read only a subset of the dataset. 

968 

969 Returns 

970 ------- 

971 obj : `object` 

972 The dataset. 

973 """ 

974 return self.datastore.get(ref, parameters=parameters) 

975 

976 def getDirectDeferred(self, ref: DatasetRef, *, 

977 parameters: Union[dict, None] = None) -> DeferredDatasetHandle: 

978 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

979 from a resolved `DatasetRef`. 

980 

981 Parameters 

982 ---------- 

983 ref : `DatasetRef` 

984 Resolved reference to an already stored dataset. 

985 parameters : `dict` 

986 Additional StorageClass-defined options to control reading, 

987 typically used to efficiently read only a subset of the dataset. 

988 

989 Returns 

990 ------- 

991 obj : `DeferredDatasetHandle` 

992 A handle which can be used to retrieve a dataset at a later time. 

993 

994 Raises 

995 ------ 

996 AmbiguousDatasetError 

997 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

998 """ 

999 if ref.id is None: 

1000 raise AmbiguousDatasetError( 

1001 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1002 ) 

1003 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1004 

1005 def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1006 dataId: Optional[DataId] = None, *, 

1007 parameters: Union[dict, None] = None, 

1008 collections: Any = None, 

1009 **kwargs: Any) -> DeferredDatasetHandle: 

1010 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1011 after an immediate registry lookup. 

1012 

1013 Parameters 

1014 ---------- 

1015 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1016 When `DatasetRef` the `dataId` should be `None`. 

1017 Otherwise the `DatasetType` or name thereof. 

1018 dataId : `dict` or `DataCoordinate`, optional 

1019 A `dict` of `Dimension` link name, value pairs that label the 

1020 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1021 should be provided as the first argument. 

1022 parameters : `dict` 

1023 Additional StorageClass-defined options to control reading, 

1024 typically used to efficiently read only a subset of the dataset. 

1025 collections : Any, optional 

1026 Collections to be searched, overriding ``self.collections``. 

1027 Can be any of the types supported by the ``collections`` argument 

1028 to butler construction. 

1029 **kwargs 

1030 Additional keyword arguments used to augment or construct a 

1031 `DataId`. See `DataId` parameters. 

1032 

1033 Returns 

1034 ------- 

1035 obj : `DeferredDatasetHandle` 

1036 A handle which can be used to retrieve a dataset at a later time. 

1037 

1038 Raises 

1039 ------ 

1040 LookupError 

1041 Raised if no matching dataset exists in the `Registry` (and 

1042 ``allowUnresolved is False``). 

1043 ValueError 

1044 Raised if a resolved `DatasetRef` was passed as an input, but it 

1045 differs from the one found in the registry. 

1046 TypeError 

1047 Raised if no collections were provided. 

1048 """ 

1049 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1050 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters) 

1051 
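# Illustrative sketch, not part of the original module: deferring the actual
# read until the handle's ``get`` is called. The dataset type, data ID, and
# collection names are hypothetical.
#
#     handle = butler.getDeferred("calexp", instrument="MyCam", visit=42,
#                                 detector=10, collections="u/alice/example-run")
#     image = handle.get()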

1052 def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1053 dataId: Optional[DataId] = None, *, 

1054 parameters: Optional[Dict[str, Any]] = None, 

1055 collections: Any = None, 

1056 **kwargs: Any) -> Any: 

1057 """Retrieve a stored dataset. 

1058 

1059 Parameters 

1060 ---------- 

1061 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1062 When `DatasetRef` the `dataId` should be `None`. 

1063 Otherwise the `DatasetType` or name thereof. 

1064 dataId : `dict` or `DataCoordinate` 

1065 A `dict` of `Dimension` link name, value pairs that label the 

1066 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1067 should be provided as the first argument. 

1068 parameters : `dict` 

1069 Additional StorageClass-defined options to control reading, 

1070 typically used to efficiently read only a subset of the dataset. 

1071 collections : Any, optional 

1072 Collections to be searched, overriding ``self.collections``. 

1073 Can be any of the types supported by the ``collections`` argument 

1074 to butler construction. 

1075 **kwargs 

1076 Additional keyword arguments used to augment or construct a 

1077 `DataCoordinate`. See `DataCoordinate.standardize` 

1078 parameters. 

1079 

1080 Returns 

1081 ------- 

1082 obj : `object` 

1083 The dataset. 

1084 

1085 Raises 

1086 ------ 

1087 ValueError 

1088 Raised if a resolved `DatasetRef` was passed as an input, but it 

1089 differs from the one found in the registry. 

1090 LookupError 

1091 Raised if no matching dataset exists in the `Registry`. 

1092 TypeError 

1093 Raised if no collections were provided. 

1094 

1095 Notes 

1096 ----- 

1097 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1098 this method requires that the given data ID include temporal dimensions 

1099 beyond the dimensions of the dataset type itself, in order to find the 

1100 dataset with the appropriate validity range. For example, a "bias" 

1101 dataset with native dimensions ``{instrument, detector}`` could be 

1102 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1103 ``exposure`` is a temporal dimension. 

1104 """ 

1105 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1106 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1107 return self.getDirect(ref, parameters=parameters) 

1108 
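# Illustrative sketch, not part of the original module: reading a calibration
# dataset as described in the Notes above, where ``exposure`` supplies the
# temporal information for the validity-range lookup. Values are hypothetical.
#
#     bias = butler.get("bias", instrument="MyCam", detector=10, exposure=1234,
#                       collections="MyCam/calib")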

1109 def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1110 dataId: Optional[DataId] = None, *, 

1111 predict: bool = False, 

1112 collections: Any = None, 

1113 run: Optional[str] = None, 

1114 **kwargs: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]: 

1115 """Returns the URIs associated with the dataset. 

1116 

1117 Parameters 

1118 ---------- 

1119 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1120 When `DatasetRef` the `dataId` should be `None`. 

1121 Otherwise the `DatasetType` or name thereof. 

1122 dataId : `dict` or `DataCoordinate` 

1123 A `dict` of `Dimension` link name, value pairs that label the 

1124 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1125 should be provided as the first argument. 

1126 predict : `bool` 

1127 If `True`, allow URIs to be returned of datasets that have not 

1128 been written. 

1129 collections : Any, optional 

1130 Collections to be searched, overriding ``self.collections``. 

1131 Can be any of the types supported by the ``collections`` argument 

1132 to butler construction. 

1133 run : `str`, optional 

1134 Run to use for predictions, overriding ``self.run``. 

1135 **kwargs 

1136 Additional keyword arguments used to augment or construct a 

1137 `DataCoordinate`. See `DataCoordinate.standardize` 

1138 parameters. 

1139 

1140 Returns 

1141 ------- 

1142 primary : `ButlerURI` 

1143 The URI to the primary artifact associated with this dataset. 

1144 If the dataset was disassembled within the datastore this 

1145 may be `None`. 

1146 components : `dict` 

1147 URIs to any components associated with the dataset artifact. 

1148 Can be empty if there are no components. 

1149 """ 

1150 ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict, 

1151 collections=collections, **kwargs) 

1152 if ref.id is None: # only possible if predict is True 

1153 if run is None: 

1154 run = self.run 

1155 if run is None: 

1156 raise TypeError("Cannot predict location with run=None.") 

1157 # Lie about ID, because we can't guess it, and only 

1158 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1159 ref = ref.resolved(id=0, run=run) 

1160 return self.datastore.getURIs(ref, predict) 

1161 

1162 def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1163 dataId: Optional[DataId] = None, *, 

1164 predict: bool = False, 

1165 collections: Any = None, 

1166 run: Optional[str] = None, 

1167 **kwargs: Any) -> ButlerURI: 

1168 """Return the URI to the Dataset. 

1169 

1170 Parameters 

1171 ---------- 

1172 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1173 When `DatasetRef` the `dataId` should be `None`. 

1174 Otherwise the `DatasetType` or name thereof. 

1175 dataId : `dict` or `DataCoordinate` 

1176 A `dict` of `Dimension` link name, value pairs that label the 

1177 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1178 should be provided as the first argument. 

1179 predict : `bool` 

1180 If `True`, allow URIs to be returned of datasets that have not 

1181 been written. 

1182 collections : Any, optional 

1183 Collections to be searched, overriding ``self.collections``. 

1184 Can be any of the types supported by the ``collections`` argument 

1185 to butler construction. 

1186 run : `str`, optional 

1187 Run to use for predictions, overriding ``self.run``. 

1188 **kwargs 

1189 Additional keyword arguments used to augment or construct a 

1190 `DataCoordinate`. See `DataCoordinate.standardize` 

1191 parameters. 

1192 

1193 Returns 

1194 ------- 

1195 uri : `ButlerURI` 

1196 URI pointing to the Dataset within the datastore. If the 

1197 Dataset does not exist in the datastore, and if ``predict`` is 

1198 `True`, the URI will be a prediction and will include a URI 

1199 fragment "#predicted". 

1200 If the datastore does not have entities that relate well 

1201 to the concept of a URI the returned URI string will be 

1202 descriptive. The returned URI is not guaranteed to be obtainable. 

1203 

1204 Raises 

1205 ------ 

1206 LookupError 

1207 A URI has been requested for a dataset that does not exist and 

1208 guessing is not allowed. 

1209 ValueError 

1210 Raised if a resolved `DatasetRef` was passed as an input, but it 

1211 differs from the one found in the registry. 

1212 TypeError 

1213 Raised if no collections were provided. 

1214 RuntimeError 

1215 Raised if a URI is requested for a dataset that consists of 

1216 multiple artifacts. 

1217 """ 

1218 primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict, 

1219 collections=collections, run=run, **kwargs) 

1220 

1221 if primary is None or components: 

1222 raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1223 "Use Butler.getURIs() instead.") 

1224 return primary 

1225 
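# Illustrative sketch, not part of the original module: obtaining the location
# of a single-artifact dataset. Values and collection names are hypothetical.
#
#     uri = butler.getURI("bias", instrument="MyCam", detector=10, exposure=1234,
#                         collections="MyCam/calib")
#     # with predict=True and a run, a not-yet-written dataset yields a URI
#     # carrying a "#predicted" fragment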

1226 def retrieveArtifacts(self, refs: Iterable[DatasetRef], 

1227 destination: Union[str, ButlerURI], transfer: str = "auto", 

1228 preserve_path: bool = True, 

1229 overwrite: bool = False) -> List[ButlerURI]: 

1230 """Retrieve the artifacts associated with the supplied refs. 

1231 

1232 Parameters 

1233 ---------- 

1234 refs : iterable of `DatasetRef` 

1235 The datasets for which artifacts are to be retrieved. 

1236 A single ref can result in multiple artifacts. The refs must 

1237 be resolved. 

1238 destination : `ButlerURI` or `str` 

1239 Location to write the artifacts. 

1240 transfer : `str`, optional 

1241 Method to use to transfer the artifacts. Must be one of the options 

1242 supported by `ButlerURI.transfer_from()`. "move" is not allowed. 

1243 preserve_path : `bool`, optional 

1244 If `True` the full path of the artifact within the datastore 

1245 is preserved. If `False` the final file component of the path 

1246 is used. 

1247 overwrite : `bool`, optional 

1248 If `True` allow transfers to overwrite existing files at the 

1249 destination. 

1250 

1251 Returns 

1252 ------- 

1253 targets : `list` of `ButlerURI` 

1254 URIs of file artifacts in destination location. Order is not 

1255 preserved. 

1256 

1257 Notes 

1258 ----- 

1259 For non-file datastores the artifacts written to the destination 

1260 may not match the representation inside the datastore. For example 

1261 a hierarchical data structure in a NoSQL database may well be stored 

1262 as a JSON file. 

1263 """ 

1264 return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer, 

1265 preserve_path=preserve_path, overwrite=overwrite) 

1266 
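# Illustrative sketch, not part of the original module: copying the file
# artifacts behind a query result to a local directory. The dataset type,
# collection, and destination path are hypothetical; "copy" is assumed to be
# one of the ButlerURI.transfer_from() modes.
#
#     refs = butler.registry.queryDatasets("raw", collections="MyCam/raw/all")
#     paths = butler.retrieveArtifacts(refs, "/tmp/exported", transfer="copy")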

1267 def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1268 dataId: Optional[DataId] = None, *, 

1269 collections: Any = None, 

1270 **kwargs: Any) -> bool: 

1271 """Return True if the Dataset is actually present in the Datastore. 

1272 

1273 Parameters 

1274 ---------- 

1275 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1276 When `DatasetRef` the `dataId` should be `None`. 

1277 Otherwise the `DatasetType` or name thereof. 

1278 dataId : `dict` or `DataCoordinate` 

1279 A `dict` of `Dimension` link name, value pairs that label the 

1280 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1281 should be provided as the first argument. 

1282 collections : Any, optional 

1283 Collections to be searched, overriding ``self.collections``. 

1284 Can be any of the types supported by the ``collections`` argument 

1285 to butler construction. 

1286 **kwargs 

1287 Additional keyword arguments used to augment or construct a 

1288 `DataCoordinate`. See `DataCoordinate.standardize` 

1289 parameters. 

1290 

1291 Raises 

1292 ------ 

1293 LookupError 

1294 Raised if the dataset is not even present in the Registry. 

1295 ValueError 

1296 Raised if a resolved `DatasetRef` was passed as an input, but it 

1297 differs from the one found in the registry. 

1298 TypeError 

1299 Raised if no collections were provided. 

1300 """ 

1301 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1302 return self.datastore.exists(ref) 

1303 
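# Illustrative sketch, not part of the original module: checking that the
# artifact behind a registered dataset is actually present in the datastore.
# Values and collection names are hypothetical.
#
#     if not butler.datasetExists("bias", instrument="MyCam", detector=10,
#                                 exposure=1234, collections="MyCam/calib"):
#         log.warning("bias is registered but missing from the datastore")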

1304 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1305 """Remove one or more `~CollectionType.RUN` collections and the 

1306 datasets within them. 

1307 

1308 Parameters 

1309 ---------- 

1310 names : `Iterable` [ `str` ] 

1311 The names of the collections to remove. 

1312 unstore : `bool`, optional 

1313 If `True` (default), delete datasets from all datastores in which 

1314 they are present, and attempt to rollback the registry deletions if 

1315 datastore deletions fail (which may not always be possible). If 

1316 `False`, datastore records for these datasets are still removed, 

1317 but any artifacts (e.g. files) will not be. 

1318 

1319 Raises 

1320 ------ 

1321 TypeError 

1322 Raised if one or more collections are not of type 

1323 `~CollectionType.RUN`. 

1324 """ 

1325 if not self.isWriteable(): 

1326 raise TypeError("Butler is read-only.") 

1327 names = list(names) 

1328 refs: List[DatasetRef] = [] 

1329 for name in names: 

1330 collectionType = self.registry.getCollectionType(name) 

1331 if collectionType is not CollectionType.RUN: 

1332 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1333 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1334 with self.registry.transaction(): 

1335 if unstore: 

1336 self.datastore.trash(refs) 

1337 else: 

1338 self.datastore.forget(refs) 

1339 for name in names: 

1340 self.registry.removeCollection(name) 

1341 if unstore: 

1342 # Point of no return for removing artifacts 

1343 self.datastore.emptyTrash() 

1344 
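# Illustrative sketch, not part of the original module: deleting scratch RUN
# collections together with (or while keeping) their stored artifacts. The
# collection names are hypothetical.
#
#     butler.removeRuns(["u/alice/scratch-run"])            # unstore=True by default
#     butler.removeRuns(["u/alice/tmp"], unstore=False)     # keep file artifacts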

1345 def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False, 

1346 unlink: Optional[List[str]] = None) -> None: 

1347 """Remove a collection and possibly prune datasets within it. 

1348 

1349 Parameters 

1350 ---------- 

1351 name : `str` 

1352 Name of the collection to remove. If this is a 

1353 `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection, 

1354 datasets within the collection are not modified unless ``unstore`` 

1355 is `True`. If this is a `~CollectionType.RUN` collection, 

1356 ``purge`` and ``unstore`` must be `True`, and all datasets in it 

1357 are fully removed from the data repository. 

1358 purge : `bool`, optional 

1359 If `True`, permit `~CollectionType.RUN` collections to be removed, 

1360 fully removing datasets within them. Requires ``unstore=True`` as 

1361 well as an added precaution against accidental deletion. Must be 

1362 `False` (default) if the collection is not a ``RUN``. 

1363 unstore : `bool`, optional 

1364 If `True`, remove all datasets in the collection from all 

1365 datastores in which they appear. 

1366 unlink : `list` [`str`], optional 

1367 Before removing the named collection, unlink it from these 

1368 parent collections. 

1369 

1370 Raises 

1371 ------ 

1372 TypeError 

1373 Raised if the butler is read-only or arguments are mutually 

1374 inconsistent. 
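
Examples
--------
A minimal sketch, assuming a writeable butler; the collection names are
illustrative only. The first call removes a TAGGED collection without
touching stored datasets, while the second fully removes a RUN collection
and the datasets within it::

    butler = Butler("/path/to/repo", writeable=True)
    butler.pruneCollection("u/alice/tagged-subset")
    butler.pruneCollection("u/alice/scratch-run", purge=True, unstore=True)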

1375 """ 

1376 # See pruneDatasets comments for more information about the logic here; 

1377 # the cases are almost the same, but here we can rely on Registry to 

1378 # take care of everything but Datastore deletion when we remove the 

1379 # collection. 

1380 if not self.isWriteable(): 

1381 raise TypeError("Butler is read-only.") 

1382 collectionType = self.registry.getCollectionType(name) 

1383 if purge and not unstore: 

1384 raise PurgeWithoutUnstorePruneCollectionsError() 

1385 if collectionType is CollectionType.RUN and not purge: 

1386 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1387 if collectionType is not CollectionType.RUN and purge: 

1388 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1389 

1390 def remove(child: str, parent: str) -> None: 

1391 """Remove a child collection from a parent collection.""" 

1392 # Remove child from parent. 

1393 chain = list(self.registry.getCollectionChain(parent)) 

1394 try: 

1395 chain.remove(child) 

1396 except ValueError as e: 

1397 raise RuntimeError(f"{child} is not a child of {parent}") from e 

1398 self.registry.setCollectionChain(parent, chain) 

1399 

1400 with self.registry.transaction(): 

1401 if unlink: 

1402 for parent in unlink: 

1403 remove(name, parent) 

1404 if unstore: 

1405 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1406 self.datastore.trash(refs) 

1407 self.registry.removeCollection(name) 

1408 

1409 if unstore: 

1410 # Point of no return for removing artifacts 

1411 self.datastore.emptyTrash() 

1412 

1413 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1414 disassociate: bool = True, 

1415 unstore: bool = False, 

1416 tags: Iterable[str] = (), 

1417 purge: bool = False, 

1418 run: Optional[str] = None) -> None: 

1419 """Remove one or more datasets from a collection and/or storage. 

1420 

1421 Parameters 

1422 ---------- 

1423 refs : `~collections.abc.Iterable` of `DatasetRef` 

1424 Datasets to prune. These must be "resolved" references (not just 

1425 a `DatasetType` and data ID). 

1426 disassociate : `bool`, optional 

1427 Disassociate pruned datasets from ``tags``, or from all collections 

1428 if ``purge=True``. 

1429 unstore : `bool`, optional 

1430 If `True` (`False` is default) remove these datasets from all 

1431 datastores known to this butler. Note that this will make it 

1432 impossible to retrieve these datasets even via other collections. 

1433 Datasets that are not already stored are ignored by this option. 

1434 tags : `Iterable` [ `str` ], optional 

1435 `~CollectionType.TAGGED` collections to disassociate the datasets 

1436 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1437 `True`. 

1438 purge : `bool`, optional 

1439 If `True` (`False` is default), completely remove the dataset from 

1440 the `Registry`. To prevent accidental deletions, ``purge`` may 

1441 only be `True` if all of the following conditions are met: 

1442 

1443 - All given datasets are in the given run; 

1444 - ``disassociate`` is `True`; 

1445 - ``unstore`` is `True`. 

1446 

1447 This mode may remove provenance information from datasets other 

1448 than those provided, and should be used with extreme care. 

1449 

1450 Raises 

1451 ------ 

1452 TypeError 

1453 Raised if the butler is read-only, if no collection was provided, 

1454 or the conditions for ``purge=True`` were not met. 
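
Examples
--------
A minimal sketch that unstores (but does not purge) all "calexp" datasets
found in one collection; the dataset type and collection names are
illustrative only::

    butler = Butler("/path/to/repo", writeable=True)
    refs = butler.registry.queryDatasets("calexp",
                                         collections="HSC/runs/scratch")
    butler.pruneDatasets(refs, disassociate=False, unstore=True)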

1455 """ 

1456 if not self.isWriteable(): 

1457 raise TypeError("Butler is read-only.") 

1458 if purge: 

1459 if not disassociate: 

1460 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1461 if not unstore: 

1462 raise TypeError("Cannot pass purge=True without unstore=True.") 

1463 elif disassociate: 

1464 tags = tuple(tags) 

1465 if not tags: 

1466 raise TypeError("No tags provided but disassociate=True.") 

1467 for tag in tags: 

1468 collectionType = self.registry.getCollectionType(tag) 

1469 if collectionType is not CollectionType.TAGGED: 

1470 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1471 f"of non-TAGGED type {collectionType.name}.") 

1472 # Transform possibly-single-pass iterable into something we can iterate 

1473 # over multiple times. 

1474 refs = list(refs) 

1475 # Pruning a component of a DatasetRef makes no sense since registry 

1476 # doesn't know about components and datastore might not store 

1477 # components in a separate file 

1478 for ref in refs: 

1479 if ref.datasetType.component(): 

1480 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1481 # We don't need an unreliable Datastore transaction for this, because 

1482 # we've been extra careful to ensure that Datastore.trash only involves 

1483 # mutating the Registry (it can _look_ at Datastore-specific things, 

1484 # but shouldn't change them), and hence all operations here are 

1485 # Registry operations. 

1486 with self.registry.transaction(): 

1487 if unstore: 

1488 self.datastore.trash(refs) 

1489 if purge: 

1490 self.registry.removeDatasets(refs) 

1491 elif disassociate: 

1492 assert tags, "Guaranteed by earlier logic in this function." 

1493 for tag in tags: 

1494 self.registry.disassociate(tag, refs) 

1495 # We've exited the Registry transaction, and apparently committed. 

1496 # (if there was an exception, everything rolled back, and it's as if 

1497 # nothing happened - and we never get here). 

1498 # Datastore artifacts are not yet gone, but they're clearly marked 

1499 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1500 # problems we can try again later, and if manual administrative 

1501 # intervention is required, it's pretty clear what that should entail: 

1502 # deleting everything on disk and in private Datastore tables that is 

1503 # in the dataset_location_trash table. 

1504 if unstore: 

1505 # Point of no return for removing artifacts 

1506 self.datastore.emptyTrash() 

1507 

1508 @transactional 

1509 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1510 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1511 ) -> None: 

1512 """Store and register one or more datasets that already exist on disk. 

1513 

1514 Parameters 

1515 ---------- 

1516 datasets : `FileDataset` 

1517 Each positional argument is a struct containing information about 

1518 a file to be ingested, including its URI (either absolute or 

1519 relative to the datastore root, if applicable), a `DatasetRef`, 

1520 and optionally a formatter class or its fully-qualified string 

1521 name. If a formatter is not provided, the formatter that would be 

1522 used for `put` is assumed. On successful return, all 

1523 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1524 attribute populated and all `FileDataset.formatter` attributes will 

1525 be set to the formatter class used. `FileDataset.path` attributes 

1526 may be modified to put paths in whatever the datastore considers a 

1527 standardized form. 

1528 transfer : `str`, optional 

1529 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1530 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1531 transfer the file. 

1532 run : `str`, optional 

1533 The name of the run ingested datasets should be added to, 

1534 overriding ``self.run``. 

1535 idGenerationMode : `DatasetIdGenEnum`, optional 

1536 Specifies option for generating dataset IDs. By default unique IDs 

1537 are generated for each inserted dataset. 

1538 

1539 Raises 

1540 ------ 

1541 TypeError 

1542 Raised if the butler is read-only or if no run was provided. 

1543 NotImplementedError 

1544 Raised if the `Datastore` does not support the given transfer mode. 

1545 DatasetTypeNotSupportedError 

1546 Raised if one or more files to be ingested have a dataset type that 

1547 is not supported by the `Datastore`. 

1548 FileNotFoundError 

1549 Raised if one of the given files does not exist. 

1550 FileExistsError 

1551 Raised if transfer is not `None` but the (internal) location the 

1552 file would be moved to is already occupied. 

1553 

1554 Notes 

1555 ----- 

1556 This operation is not fully exception safe: if a database operation 

1557 fails, the given `FileDataset` instances may be only partially updated. 

1558 

1559 It is atomic in terms of database operations (they will either all 

1560 succeed or all fail), provided the database engine implements 

1561 transactions correctly. It will attempt to be atomic in terms of 

1562 filesystem operations as well, but this cannot be implemented 

1563 rigorously for most datastores. 
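
Examples
--------
A minimal sketch, assuming a writeable butler, a registered "raw" dataset
type whose dimensions are instrument, exposure and detector, and an
illustrative file path and run name::

    butler.registry.registerRun("HSC/raw/all")
    ref = DatasetRef(butler.registry.getDatasetType("raw"),
                     {"instrument": "HSC", "exposure": 903334,
                      "detector": 16})
    dataset = FileDataset(path="/data/HSC-0903334-016.fits", refs=[ref])
    butler.ingest(dataset, transfer="copy", run="HSC/raw/all")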

1564 """ 

1565 if not self.isWriteable(): 

1566 raise TypeError("Butler is read-only.") 

1567 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1568 # Reorganize the inputs so they're grouped by DatasetType and then 

1569 # data ID. We also include a list of DatasetRefs for each FileDataset 

1570 # to hold the resolved DatasetRefs returned by the Registry, before 

1571 # it's safe to swap them into FileDataset.refs. 

1572 # Some type annotation aliases to make that clearer: 

1573 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1574 GroupedData = MutableMapping[DatasetType, GroupForType] 

1575 # The actual data structure: 

1576 groupedData: GroupedData = defaultdict(dict) 

1577 # And the nested loop that populates it: 

1578 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1579 # This list intentionally shared across the inner loop, since it's 

1580 # associated with `dataset`. 

1581 resolvedRefs: List[DatasetRef] = [] 

1582 

1583 # Somewhere to store pre-existing refs if we have an 

1584 # execution butler. 

1585 existingRefs: List[DatasetRef] = [] 

1586 

1587 for ref in dataset.refs: 

1588 if ref.dataId in groupedData[ref.datasetType]: 

1589 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same" 

1590 " DataId as other ingest dataset" 

1591 f" {groupedData[ref.datasetType][ref.dataId][0].path}" 

1592 f" ({ref.dataId})") 

1593 if self._allow_put_of_predefined_dataset: 

1594 existing_ref = self.registry.findDataset(ref.datasetType, 

1595 dataId=ref.dataId, 

1596 collections=run) 

1597 if existing_ref: 

1598 if self.datastore.knows(existing_ref): 

1599 raise ConflictingDefinitionError(f"Dataset associated with path {dataset.path}" 

1600 f" already exists as {existing_ref}.") 

1601 # Store this ref elsewhere since it already exists 

1602 # and we do not want to remake it but we do want 

1603 # to store it in the datastore. 

1604 existingRefs.append(existing_ref) 

1605 

1606 # Nothing else to do until we have finished 

1607 # iterating. 

1608 continue 

1609 

1610 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1611 

1612 if existingRefs: 

1613 

1614 if len(dataset.refs) != len(existingRefs): 

1615 # Keeping track of partially pre-existing datasets is hard 

1616 # and should generally never happen. For now don't allow 

1617 # it. 

1618 raise ConflictingDefinitionError(f"For dataset {dataset.path} some dataIds already exist" 

1619 " in registry but others do not. This is not supported.") 

1620 

1621 # Attach the resolved refs if we found them. 

1622 dataset.refs = existingRefs 

1623 

1624 # Now we can bulk-insert into Registry for each DatasetType. 

1625 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1626 desc="Bulk-inserting datasets by type"): 

1627 refs = self.registry.insertDatasets( 

1628 datasetType, 

1629 dataIds=groupForType.keys(), 

1630 run=run, 

1631 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1632 idGenerationMode=idGenerationMode, 

1633 ) 

1634 # Append those resolved DatasetRefs to the new lists we set up for 

1635 # them. 

1636 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1637 resolvedRefs.append(ref) 

1638 

1639 # Go back to the original FileDatasets to replace their refs with the 

1640 # new resolved ones. 

1641 for groupForType in progress.iter_chunks(groupedData.values(), 

1642 desc="Reassociating resolved dataset refs with files"): 

1643 for dataset, resolvedRefs in groupForType.values(): 

1644 dataset.refs = resolvedRefs 

1645 

1646 # Bulk-insert everything into Datastore. 

1647 self.datastore.ingest(*datasets, transfer=transfer) 

1648 

1649 @contextlib.contextmanager 

1650 def export(self, *, directory: Optional[str] = None, 

1651 filename: Optional[str] = None, 

1652 format: Optional[str] = None, 

1653 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1654 """Export datasets from the repository represented by this `Butler`. 

1655 

1656 This method is a context manager that returns a helper object 

1657 (`RepoExportContext`) that is used to indicate what information from 

1658 the repository should be exported. 

1659 

1660 Parameters 

1661 ---------- 

1662 directory : `str`, optional 

1663 Directory dataset files should be written to if ``transfer`` is not 

1664 `None`. 

1665 filename : `str`, optional 

1666 Name for the file that will include database information associated 

1667 with the exported datasets. If this is not an absolute path and 

1668 ``directory`` is not `None`, it will be written to ``directory`` 

1669 instead of the current working directory. Defaults to 

1670 "export.{format}". 

1671 format : `str`, optional 

1672 File format for the database information file. If `None`, the 

1673 extension of ``filename`` will be used. 

1674 transfer : `str`, optional 

1675 Transfer mode passed to `Datastore.export`. 

1676 

1677 Raises 

1678 ------ 

1679 TypeError 

1680 Raised if the set of arguments passed is inconsistent. 

1681 

1682 Examples 

1683 -------- 

1684 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1685 methods are used to provide the iterables over data IDs and/or datasets 

1686 to be exported:: 

1687 

1688 with butler.export("exports.yaml") as export: 

1689 # Export all flats, but none of the dimension element rows 

1690 # (i.e. data ID information) associated with them. 

1691 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1692 elements=()) 

1693 # Export all datasets that start with "deepCoadd_" and all of 

1694 # their associated data ID information. 

1695 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1696 """ 

1697 if directory is None and transfer is not None: 

1698 raise TypeError("Cannot transfer without providing a directory.") 

1699 if transfer == "move": 

1700 raise TypeError("Transfer may not be 'move': export is read-only") 

1701 if format is None: 

1702 if filename is None: 

1703 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1704 else: 

1705 _, format = os.path.splitext(filename) 

1706 elif filename is None: 

1707 filename = f"export.{format}" 

1708 if directory is not None: 

1709 filename = os.path.join(directory, filename) 

1710 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1711 with open(filename, 'w') as stream: 

1712 backend = BackendClass(stream) 

1713 try: 

1714 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1715 directory=directory, transfer=transfer) 

1716 yield helper 

1717 except BaseException: 

1718 raise 

1719 else: 

1720 helper._finish() 

1721 

1722 def import_(self, *, directory: Optional[str] = None, 

1723 filename: Union[str, TextIO, None] = None, 

1724 format: Optional[str] = None, 

1725 transfer: Optional[str] = None, 

1726 skip_dimensions: Optional[Set] = None, 

1727 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1728 reuseIds: bool = False) -> None: 

1729 """Import datasets into this repository that were exported from a 

1730 different butler repository via `~lsst.daf.butler.Butler.export`. 

1731 

1732 Parameters 

1733 ---------- 

1734 directory : `str`, optional 

1735 Directory containing dataset files to import from. If `None`, 

1736 ``filename`` and all dataset file paths specified therein must 

1737 be absolute. 

1738 filename : `str` or `TextIO`, optional 

1739 A stream or name of file that contains database information 

1740 associated with the exported datasets, typically generated by 

1741 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1742 is not an absolute path, does not exist in the current working 

1743 directory, and ``directory`` is not `None`, it is assumed to be in 

1744 ``directory``. Defaults to "export.{format}". 

1745 format : `str`, optional 

1746 File format for ``filename``. If `None`, the extension of 

1747 ``filename`` will be used. 

1748 transfer : `str`, optional 

1749 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1750 skip_dimensions : `set`, optional 

1751 Names of dimensions that should be skipped and not imported. 

1752 idGenerationMode : `DatasetIdGenEnum`, optional 

1753 Specifies option for generating dataset IDs when IDs are not 

1754 provided or their type does not match backend type. By default 

1755 unique IDs are generated for each inserted dataset. 

1756 reuseIds : `bool`, optional 

1757 If `True` then forces re-use of imported dataset IDs for integer 

1758 IDs, which are normally generated as auto-incremented; an exception 

1759 will be raised if imported IDs clash with existing ones. This 

1760 option has no effect on the use of globally-unique IDs which are 

1761 always re-used (or generated if integer IDs are being imported). 

1762 

1763 Raises 

1764 ------ 

1765 TypeError 

1766 Raised if the set of arguments passed is inconsistent, or if the 

1767 butler is read-only. 
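
Examples
--------
A minimal sketch that loads an export file previously written by
`~lsst.daf.butler.Butler.export` (the paths are illustrative only)::

    butler = Butler("/path/to/repo", writeable=True)
    butler.import_(directory="/path/to/exported/files",
                   filename="export.yaml", transfer="copy")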

1768 """ 

1769 if not self.isWriteable(): 

1770 raise TypeError("Butler is read-only.") 

1771 if format is None: 

1772 if filename is None: 

1773 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1774 else: 

1775 _, format = os.path.splitext(filename) # type: ignore 

1776 elif filename is None: 

1777 filename = f"export.{format}" 

1778 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1779 filename = os.path.join(directory, filename) 

1780 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1781 

1782 def doImport(importStream: TextIO) -> None: 

1783 backend = BackendClass(importStream, self.registry) 

1784 backend.register() 

1785 with self.transaction(): 

1786 backend.load(self.datastore, directory=directory, transfer=transfer, 

1787 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1788 reuseIds=reuseIds) 

1789 

1790 if isinstance(filename, str): 

1791 with open(filename, "r") as stream: 

1792 doImport(stream) 

1793 else: 

1794 doImport(filename) 

1795 

1796 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef], 

1797 transfer: str = "auto", 

1798 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None, 

1799 skip_missing: bool = True) -> List[DatasetRef]: 

1800 """Transfer datasets to this Butler from a run in another Butler. 

1801 

1802 Parameters 

1803 ---------- 

1804 source_butler : `Butler` 

1805 Butler from which the datasets are to be transferred. 

1806 source_refs : iterable of `DatasetRef` 

1807 Datasets defined in the source butler that should be transferred to 

1808 this butler. 

1809 transfer : `str`, optional 

1810 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

1811 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

1812 A mapping of dataset type to ID generation mode. Only used if 

1813 the source butler is using integer IDs. Should not be used 

1814 if this receiving butler uses integer IDs. If not provided, the 

1815 import always uses `DatasetIdGenEnum.UNIQUE`. 

1816 skip_missing : `bool`, optional 

1817 If `True`, datasets with no datastore artifact associated with 

1818 them are not transferred. If `False` a registry entry will be 

1819 created even if no datastore record is created (and so will 

1820 look equivalent to the dataset being unstored). 

1821 

1822 Returns 

1823 ------- 

1824 refs : `list` of `DatasetRef` 

1825 The refs added to this Butler. 

1826 

1827 Notes 

1828 ----- 

1829 Requires that any dimension definitions are already present in the 

1830 receiving Butler. The datastore artifact has to exist for a transfer 

1831 to be made but non-existence is not an error. 

1832 

1833 Datasets that already exist in this run will be skipped. 

1834 

1835 The datasets are imported as part of a transaction, although 

1836 dataset types are registered before the transaction is started. 

1837 This means that it is possible for a dataset type to be registered 

1838 even though transfer has failed. 
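
Examples
--------
A minimal sketch that copies every dataset found in one run of a source
repository into this butler (the paths and collection name are
illustrative only)::

    source = Butler("/path/to/source/repo")
    dest = Butler("/path/to/dest/repo", writeable=True)
    refs = source.registry.queryDatasets(..., collections="HSC/runs/test",
                                         findFirst=True)
    dest.transfer_from(source, refs, transfer="copy")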

1839 """ 

1840 if not self.isWriteable(): 

1841 raise TypeError("Butler is read-only.") 

1842 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1843 

1844 # Will iterate through the refs multiple times so need to convert 

1845 # to a list if this isn't a collection. 

1846 if not isinstance(source_refs, collections.abc.Collection): 

1847 source_refs = list(source_refs) 

1848 

1849 original_count = len(source_refs) 

1850 log.info("Transferring %d datasets into %s", original_count, str(self)) 

1851 

1852 if id_gen_map is None: 

1853 id_gen_map = {} 

1854 

1855 # In some situations the datastore artifact may be missing 

1856 # and we do not want that registry entry to be imported. 

1857 # Asking datastore is not sufficient, the records may have been 

1858 # purged, we have to ask for the (predicted) URI and check 

1859 # existence explicitly. Execution butler is set up exactly like 

1860 # this with no datastore records. 

1861 artifact_existence: Dict[ButlerURI, bool] = {} 

1862 if skip_missing: 

1863 dataset_existence = source_butler.datastore.mexists(source_refs, 

1864 artifact_existence=artifact_existence) 

1865 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1866 filtered_count = len(source_refs) 

1867 log.log(VERBOSE, "%d datasets removed because the artifact does not exist. Now have %d.", 

1868 original_count - filtered_count, filtered_count) 

1869 

1870 # Importing requires that we group the refs by dataset type and run 

1871 # before doing the import. 

1872 grouped_refs = defaultdict(list) 

1873 grouped_indices = defaultdict(list) 

1874 for i, ref in enumerate(source_refs): 

1875 grouped_refs[ref.datasetType, ref.run].append(ref) 

1876 grouped_indices[ref.datasetType, ref.run].append(i) 

1877 

1878 # Register any dataset types we need. This has to be done outside 

1879 # of a transaction and so will not be rolled back on failure. 

1880 for datasetType, _ in grouped_refs: 

1881 self.registry.registerDatasetType(datasetType) 

1882 

1883 # The returned refs should be identical for UUIDs. 

1884 # For now must also support integers and so need to retain the 

1885 # newly-created refs from this registry. 

1886 # Pre-size it so we can assign refs into the correct slots 

1887 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

1888 default_id_gen = DatasetIdGenEnum.UNIQUE 

1889 

1890 # Do all the importing in a single transaction. 

1891 with self.transaction(): 

1892 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(), 

1893 desc="Importing to registry" 

1894 " by run and dataset type"): 

1895 run_doc = source_butler.registry.getCollectionDocumentation(run) 

1896 self.registry.registerCollection(run, CollectionType.RUN, doc=run_doc) 

1897 

1898 id_generation_mode = default_id_gen 

1899 if isinstance(refs_to_import[0].id, int): 

1900 # ID generation mode might need to be overridden when 

1901 # targeting UUIDs 

1902 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

1903 

1904 n_refs = len(refs_to_import) 

1905 log.log(VERBOSE, "Importing %d ref%s of dataset type %s into run %s", 

1906 n_refs, "" if n_refs == 1 else "s", datasetType.name, run) 

1907 

1908 # No way to know if this butler's registry uses UUID. 

1909 # We have to trust the caller on this. If it fails they will 

1910 # have to change their approach. We can't catch the exception 

1911 # and retry with unique because that will mess up the 

1912 # transaction handling. We aren't allowed to ask the registry 

1913 # manager what type of ID it is using. 

1914 imported_refs = self.registry._importDatasets(refs_to_import, 

1915 idGenerationMode=id_generation_mode, 

1916 expand=False) 

1917 

1918 # Map them into the correct slots to match the initial order 

1919 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

1920 transferred_refs_tmp[i] = ref 

1921 

1922 # Mypy insists that we might have None in here so we have to make 

1923 # that explicit by assigning to a new variable and filtering out 

1924 # something that won't be there. 

1925 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

1926 

1927 # Check consistency 

1928 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

1929 

1930 log.log(VERBOSE, "Imported %d datasets into destination butler", len(transferred_refs)) 

1931 

1932 # The transferred refs were assigned into slots above so that they 

1933 # match the original ordering given by the caller. Without that 

1934 # ordering the datastore transfer would be broken. 

1935 

1936 # Ask the datastore to transfer. The datastore has to check that 

1937 # the source datastore is compatible with the target datastore. 

1938 self.datastore.transfer_from(source_butler.datastore, source_refs, 

1939 local_refs=transferred_refs, transfer=transfer, 

1940 artifact_existence=artifact_existence) 

1941 

1942 return transferred_refs 

1943 

1944 def validateConfiguration(self, logFailures: bool = False, 

1945 datasetTypeNames: Optional[Iterable[str]] = None, 

1946 ignore: Optional[Iterable[str]] = None) -> None: 

1947 """Validate butler configuration. 

1948 

1949 Checks that each `DatasetType` can be stored in the `Datastore`. 

1950 

1951 Parameters 

1952 ---------- 

1953 logFailures : `bool`, optional 

1954 If `True`, output a log message for every validation error 

1955 detected. 

1956 datasetTypeNames : iterable of `str`, optional 

1957 The `DatasetType` names that should be checked. This allows 

1958 only a subset to be selected. 

1959 ignore : iterable of `str`, optional 

1960 Names of DatasetTypes to skip over. This can be used to skip 

1961 known problems. If a named `DatasetType` corresponds to a 

1962 composite, all components of that `DatasetType` will also be 

1963 ignored. 

1964 

1965 Raises 

1966 ------ 

1967 ButlerValidationError 

1968 Raised if there is some inconsistency with how this Butler 

1969 is configured. 
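
Examples
--------
A minimal sketch that validates only two dataset types and logs each
failure (the dataset type names are illustrative only)::

    butler = Butler("/path/to/repo")
    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["raw", "calexp"])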

1970 """ 

1971 if datasetTypeNames: 

1972 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1973 else: 

1974 datasetTypes = list(self.registry.queryDatasetTypes()) 

1975 

1976 # filter out anything from the ignore list 

1977 if ignore: 

1978 ignore = set(ignore) 

1979 datasetTypes = [e for e in datasetTypes 

1980 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1981 else: 

1982 ignore = set() 

1983 

1984 # Find all the registered instruments 

1985 instruments = set( 

1986 record.name for record in self.registry.queryDimensionRecords("instrument") 

1987 ) 

1988 

1989 # For each datasetType that has an instrument dimension, create 

1990 # a DatasetRef for each defined instrument 

1991 datasetRefs = [] 

1992 

1993 for datasetType in datasetTypes: 

1994 if "instrument" in datasetType.dimensions: 

1995 for instrument in instruments: 

1996 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

1997 conform=False) 

1998 datasetRefs.append(datasetRef) 

1999 

2000 entities: List[Union[DatasetType, DatasetRef]] = [] 

2001 entities.extend(datasetTypes) 

2002 entities.extend(datasetRefs) 

2003 

2004 datastoreErrorStr = None 

2005 try: 

2006 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2007 except ValidationError as e: 

2008 datastoreErrorStr = str(e) 

2009 

2010 # Also check that the LookupKeys used by the datastores match 

2011 # registry and storage class definitions 

2012 keys = self.datastore.getLookupKeys() 

2013 

2014 failedNames = set() 

2015 failedDataId = set() 

2016 for key in keys: 

2017 if key.name is not None: 

2018 if key.name in ignore: 

2019 continue 

2020 

2021 # skip if specific datasetType names were requested and this 

2022 # name does not match 

2023 if datasetTypeNames and key.name not in datasetTypeNames: 

2024 continue 

2025 

2026 # See if it is a StorageClass or a DatasetType 

2027 if key.name in self.storageClasses: 

2028 pass 

2029 else: 

2030 try: 

2031 self.registry.getDatasetType(key.name) 

2032 except KeyError: 

2033 if logFailures: 

2034 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2035 failedNames.add(key) 

2036 else: 

2037 # Dimensions are checked for consistency when the Butler 

2038 # is created and rendezvoused with a universe. 

2039 pass 

2040 

2041 # Check that the instrument is a valid instrument 

2042 # Currently only support instrument so check for that 

2043 if key.dataId: 

2044 dataIdKeys = set(key.dataId) 

2045 if {"instrument"} != dataIdKeys: 

2046 if logFailures: 

2047 log.critical("Key '%s' has unsupported DataId override", key) 

2048 failedDataId.add(key) 

2049 elif key.dataId["instrument"] not in instruments: 

2050 if logFailures: 

2051 log.critical("Key '%s' has unknown instrument", key) 

2052 failedDataId.add(key) 

2053 

2054 messages = [] 

2055 

2056 if datastoreErrorStr: 

2057 messages.append(datastoreErrorStr) 

2058 

2059 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2060 (failedDataId, "Keys with bad DataId entries: ")): 

2061 if failed: 

2062 msg += ", ".join(str(k) for k in failed) 

2063 messages.append(msg) 

2064 

2065 if messages: 

2066 raise ValidationError(";\n".join(messages)) 

2067 

2068 @property 

2069 def collections(self) -> CollectionSearch: 

2070 """The collections to search by default, in order (`CollectionSearch`). 

2071 

2072 This is an alias for ``self.registry.defaults.collections``. It cannot 

2073 be set directly in isolation, but all defaults may be changed together 

2074 by assigning a new `RegistryDefaults` instance to 

2075 ``self.registry.defaults``. 
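
For example, all defaults may be replaced in one assignment (a sketch;
the collection and run names are illustrative only)::

    from lsst.daf.butler.registry import RegistryDefaults

    butler.registry.defaults = RegistryDefaults(
        collections=["HSC/defaults"], run="u/alice/scratch")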

2076 """ 

2077 return self.registry.defaults.collections 

2078 

2079 @property 

2080 def run(self) -> Optional[str]: 

2081 """Name of the run this butler writes outputs to by default (`str` or 

2082 `None`). 

2083 

2084 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2085 directly in isolation, but all defaults may be changed together by 

2086 assigning a new `RegistryDefaults` instance to 

2087 ``self.registry.defaults``. 

2088 """ 

2089 return self.registry.defaults.run 

2090 

2091 registry: Registry 

2092 """The object that manages dataset metadata and relationships (`Registry`). 

2093 

2094 Most operations that don't involve reading or writing butler datasets are 

2095 accessible only via `Registry` methods. 

2096 """ 

2097 

2098 datastore: Datastore 

2099 """The object that manages actual dataset storage (`Datastore`). 

2100 

2101 Direct user access to the datastore should rarely be necessary; the primary 

2102 exception is the case where a `Datastore` implementation provides extra 

2103 functionality beyond what the base class defines. 

2104 """ 

2105 

2106 storageClasses: StorageClassFactory 

2107 """An object that maps known storage class names to objects that fully 

2108 describe them (`StorageClassFactory`). 

2109 """ 

2110 

2111 _allow_put_of_predefined_dataset: bool 

2112 """Allow a put to succeed even if there is already a registry entry for it 

2113 but not a datastore record. (`bool`)."""