# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
    "PruneCollectionsArgsError",
    "PurgeWithoutUnstorePruneCollectionsError",
    "RunWithoutPurgePruneCollectionsError",
    "PurgeUnsupportedPruneCollectionsError",
)

import collections.abc
from collections import defaultdict
import contextlib
import logging
import numbers
import os
from typing import (
    Any,
    ClassVar,
    Counter,
    Dict,
    Iterable,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    TextIO,
    Tuple,
    Type,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    AmbiguousDatasetError,
    ButlerURI,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetRef,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    FileDataset,
    Progress,
    StorageClassFactory,
    Timespan,
    ValidationError,
    VERBOSE,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import (
    Registry,
    RegistryConfig,
    RegistryDefaults,
    CollectionSearch,
    CollectionType,
    ConflictingDefinitionError,
    DatasetIdGenEnum,
)
from .transfers import RepoExportContext

log = logging.getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class PruneCollectionsArgsError(TypeError):
    """Base class for errors relating to Butler.pruneCollections input
    arguments.
    """
    pass


class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when ``purge`` is `True` but ``unstore`` is `False`; the two
    arguments must be `True` together.
    """

    def __init__(self) -> None:
        super().__init__("Cannot pass purge=True without unstore=True.")


class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when pruning a RUN collection but ``purge`` is `False`."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")


class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
    """Raised when ``purge`` is `True` but is not supported for the given
    collection."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(
            f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")


class Butler:
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the
        `ButlerConfig` constructor. If a directory path
        is given the configuration will be read from a ``butler.yaml`` file in
        that location. If `None` is given default values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order) when
        reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may be
        manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not `None`,
        ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if ``run`` is not `None` and
        a read-only butler otherwise.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have the
        same value (or no value) for a governor dimension, that value will be
        the default for that dimension. Nonexistent collections are ignored.
        If a default value is provided explicitly for a governor dimension via
        ``**kwargs``, no default will be inferred for that dimension.
    **kwargs : `str`
        Default data ID key-value pairs. These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts with
    the collections in its `Registry`, the most common cases are still simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use ``butler.registry``,
    e.g. for inserting dimension data or managing collections, or when the
    collections you want to use with the butler are not consistent.
    Passing ``writeable`` explicitly here is only necessary if you want to be
    able to make changes to the repo; usually the value for ``writeable`` can
    be guessed from the collection arguments provided, but it defaults to
    `False` when there are no collection arguments.
    """
    def __init__(self, config: Union[Config, str, None] = None, *,
                 butler: Optional[Butler] = None,
                 collections: Any = None,
                 run: Optional[str] = None,
                 searchPaths: Optional[List[str]] = None,
                 writeable: Optional[bool] = None,
                 inferDefaults: bool = True,
                 **kwargs: str,
                 ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
                                "arguments with 'butler' argument.")
            self.registry = butler.registry.copy(defaults)
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
            self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            if "root" in self._config:
                butlerRoot = self._config["root"]
            else:
                butlerRoot = self._config.configDir
            if writeable is None:
                writeable = run is not None
            self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
                                                defaults=defaults)
            self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
                                                  butlerRoot=butlerRoot)
            self.storageClasses = StorageClassFactory()
            self.storageClasses.addFromConfig(self._config)
            self._allow_put_of_predefined_dataset = self._config.get("allow_put_of_predefined_dataset",
                                                                     False)
        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None,
                 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
                 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
                 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str` or `ButlerURI`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, used to initialize the registry
            database.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not `None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
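
        Examples
        --------
        A minimal sketch of creating a repository with default configuration
        and then constructing a `Butler` for it (the path below is
        hypothetical)::

            Butler.makeRepo("/path/to/repo")
            butler = Butler("/path/to/repo", writeable=True)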

        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made
        uri = ButlerURI(root, forceDirectory=True)
        uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # if key exists in given config, parse it, otherwise parse the defaults
        # in the expanded config
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(RegistryConfig, config, full,
                                    toUpdate={"db": defaultDatabaseUri},
                                    overwrite=forceConfigRoot)
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
                                    overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore. Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
        configURI: Union[str, ButlerURI]
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = uri.geturl()
            configURI = outfile
        else:
            configURI = uri
        config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)

        log.log(VERBOSE, "Wrote new Butler configuration file to %s", configURI)

        return config

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
                  defaultDataId: Dict[str, str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `CollectionSearch`
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think ``**defaultDataId`` is a _positional_ argument!
        return cls(config=config, collections=collections, run=run, writeable=writeable,
                   **defaultDataId)  # type: ignore

    def __reduce__(self) -> tuple:
        """Support pickling.
        """
        return (Butler._unpickle, (self._config, self.collections, self.run,
                                   self.registry.defaults.dataId.byName(),
                                   self.registry.isWriteable()))

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations.
        """
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
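
        Examples
        --------
        A minimal sketch of grouping two writes so that they succeed or fail
        together (the dataset type and data IDs are hypothetical)::

            with butler.transaction():
                butler.put(catalog1, "src", dataId1)
                butler.put(catalog2, "src", dataId2)

        If the second `put` raises, the first one is rolled back as well.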

        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any
                         ) -> Tuple[DatasetType, Optional[DataId]]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: Optional[DatasetType] = None
        internalDatasetType: Optional[DatasetType] = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collections: Any = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        timespan: Optional[Timespan] = None

        # Process dimension records that are using record information
        # rather than ids
        newDataId: Dict[str, DataIdValue] = {}
        byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # if all the dataId comes from keyword parameters we do not need
        # to do anything here because they can't be of the form
        # exposure.obs_id because a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key. We have already filtered out the compound
        # keys dimensions.record format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwds):
            # Use a list so we can adjust the dict safely in the loop
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug("Converting dimension %s to %s.%s=%s",
                                      dimensionName, dimensionName, alternate.name, value)
                            break
                    else:
                        log.warning("Type mismatch found for value '%r' provided for dimension %s. "
                                    "Could not find matching alternative (primary key has type %s) "
                                    "so attempting to use as-is.",
                                    value, dimensionName, dimension.primaryKey.getPythonType())

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions. This is made more complicated
        # by some dimensions having records with clashing names. A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Calculate missing dimensions
            provided = set(newDataId) | set(kwds) | set(byRecord)
            missingDimensions = datasetType.dimensions.names - provided

            # For calibrations we may well be needing temporal dimensions
            # so rather than always including all dimensions in the scan
            # restrict things a little. It is still possible for there
            # to be confusion over day_obs in visit vs exposure for example.
            # If we are not searching calibration collections things may
            # fail but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            candidateDimensions: Set[str] = set()
            candidateDimensions.update(missingDimensions)
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension
            guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: Dict[str, Set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions. Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs.
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions)
                    requiredButMissing = assignedDimensions.intersection(missingDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common. If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                              " Removed ambiguity by choosing dimension %s.",
                              fieldName, ", ".join(assignedDimensions), selected)

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
                              dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.warning("DataId specified explicit %s dimension value of %s in addition to"
                                " general record specifiers for it of %s. Ignoring record information.",
                                dimensionName, newDataId[dimensionName], str(values))
                    continue

                # Build up a WHERE expression -- use single quotes
                def quote(s: Any) -> str:
                    if isinstance(s, str):
                        return f"'{s}'"
                    else:
                        return s

                where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
                                     for k, v in values.items())

                # Hopefully we get a single record that matches
                records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
                                                                  where=where, **kwds))

                if len(records) != 1:
                    if len(records) > 1:
                        log.debug("Received %d records from constraints of %s", len(records), str(values))
                        for r in records:
                            log.debug("- %s", str(r))
                        raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
                                           f" uniquely constrained to a single dataset by {values}."
                                           f" Got {len(records)} results.")
                    raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
                                       f" records when constrained by {values}")

                # Get the primary key from the real dimension object
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

            # We have modified the dataId so need to switch to it
            dataId = newDataId

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to
            # standardize the data ID without restricting the dimensions to
            # those of the dataset type requested, because there may be extra
            # dimensions that provide temporal information for a validity-range
            # lookup.
            dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
            if dataId.graph.temporal:
                dataId = self.registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                if collections is None:
                    collections = self.registry.defaults.collections
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collections {collections}.")
        if idNumber is not None and idNumber != ref.id:
            if collections is None:
                collections = self.registry.defaults.collections
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collections {collections}.")
        return ref

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
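
        Examples
        --------
        A minimal sketch, assuming a writeable butler constructed with
        ``run="u/alice/DM-50000/a"`` and a registered dataset type named
        ``deepCoadd`` (names and data ID values are hypothetical)::

            ref = butler.put(coadd, "deepCoadd", tract=9813, patch=42,
                             band="i", skymap="hsc_rings_v1")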

        """
        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        # Add Registry Dataset entry.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)

        # For an execution butler the datasets will be pre-defined.
        # If the butler is configured that way datasets should only be inserted
        # if they do not already exist in registry. Trying and catching
        # ConflictingDefinitionError will not work because the transaction
        # will be corrupted. Instead, in this mode always check first.
        ref = None
        ref_is_predefined = False
        if self._allow_put_of_predefined_dataset:
            # Get the matching ref for this run.
            ref = self.registry.findDataset(datasetType, collections=run,
                                            dataId=dataId)

            if ref:
                # Must be expanded form for datastore templating
                dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions)
                ref = ref.expanded(dataId)
                ref_is_predefined = True

        if not ref:
            ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])

        # If the ref is predefined it is possible that the datastore also
        # has the record. Asking datastore to put it again will result in
        # the artifact being recreated, overwriting previous, then will cause
        # a failure in writing the record which will cause the artifact
        # to be removed. Much safer to ask first before attempting to
        # overwrite. Race conditions should not be an issue for the
        # execution butler environment.
        if ref_is_predefined:
            if self.datastore.knows(ref):
                raise ConflictingDefinitionError(f"Dataset associated {ref} already exists.")

        self.datastore.put(obj, ref)

        return ref

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
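
        Examples
        --------
        A minimal sketch of reading datasets found through a registry query
        rather than through this butler's default collections (collection and
        dataset type names are hypothetical)::

            refs = butler.registry.queryDatasets("calexp",
                                                 collections="u/bob/DM-49998")
            for ref in refs:
                calexp = butler.getDirect(ref)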

        """
        return self.datastore.get(ref, parameters=parameters)

    def getDirectDeferred(self, ref: DatasetRef, *,
                          parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id is None``, i.e. the reference is unresolved.
        """
        if ref.id is None:
            raise AmbiguousDatasetError(
                f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
            )
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collections: Any = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
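
        Examples
        --------
        A minimal sketch; the registry lookup happens immediately but the
        (potentially expensive) read is deferred until requested (dataset type
        and data ID values are hypothetical)::

            handle = butler.getDeferred("calexp", instrument="HSC",
                                        visit=903334, detector=42)
            calexp = handle.get()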

        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collections: Any = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if no collections were provided.

        Notes
        -----
        When looking up datasets in a `~CollectionType.CALIBRATION` collection,
        this method requires that the given data ID include temporal dimensions
        beyond the dimensions of the dataset type itself, in order to find the
        dataset with the appropriate validity range. For example, a "bias"
        dataset with native dimensions ``{instrument, detector}`` could be
        fetched with a ``{instrument, detector, exposure}`` data ID, because
        ``exposure`` is a temporal dimension.
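
        Examples
        --------
        A minimal sketch, assuming a butler constructed with default
        collections and a registered ``calexp`` dataset type (names and data
        ID values are hypothetical)::

            calexp = butler.get("calexp", instrument="HSC", visit=903334,
                                detector=42)

        A ``parameters`` dict can be used to read only part of the dataset,
        e.g. ``parameters={"bbox": bbox}`` for an image cutout, if the
        storage class supports it.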

        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.getDirect(ref, parameters=parameters)

    def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                dataId: Optional[DataId] = None, *,
                predict: bool = False,
                collections: Any = None,
                run: Optional[str] = None,
                **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Return the URIs associated with the dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
                                   collections=collections, **kwds)
        if ref.id is None:  # only possible if predict is True
            if run is None:
                run = self.run
                if run is None:
                    raise TypeError("Cannot predict location with run=None.")
            # Lie about ID, because we can't guess it, and only
            # Datastore.getURIs() will ever see it (and it doesn't use it).
            ref = ref.resolved(id=0, run=run)
        return self.datastore.getURIs(ref, predict)

    def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collections: Any = None,
               run: Optional[str] = None,
               **kwds: Any) -> ButlerURI:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        RuntimeError
            Raised if a URI is requested for a dataset that consists of
            multiple artifacts.
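
        Examples
        --------
        A minimal sketch (dataset type and data ID values are hypothetical)::

            uri = butler.getURI("calexp", instrument="HSC", visit=903334,
                                detector=42)
            print(uri.geturl())

        For disassembled (multi-file) datasets, use `getURIs` instead.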

        """
        primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
                                           collections=collections, run=run, **kwds)

        if primary is None or components:
            raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
                               "Use Butler.getURIs() instead.")
        return primary

    def retrieveArtifacts(self, refs: Iterable[DatasetRef],
                          destination: Union[str, ButlerURI], transfer: str = "auto",
                          preserve_path: bool = True,
                          overwrite: bool = False) -> List[ButlerURI]:
        """Retrieve the artifacts associated with the supplied refs.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets for which artifacts are to be retrieved.
            A single ref can result in multiple artifacts. The refs must
            be resolved.
        destination : `ButlerURI` or `str`
            Location to write the artifacts.
        transfer : `str`, optional
            Method to use to transfer the artifacts. Must be one of the options
            supported by `ButlerURI.transfer_from()`. "move" is not allowed.
        preserve_path : `bool`, optional
            If `True` the full path of the artifact within the datastore
            is preserved. If `False` the final file component of the path
            is used.
        overwrite : `bool`, optional
            If `True` allow transfers to overwrite existing files at the
            destination.

        Returns
        -------
        targets : `list` of `ButlerURI`
            URIs of file artifacts in destination location. Order is not
            preserved.

        Notes
        -----
        For non-file datastores the artifacts written to the destination
        may not match the representation inside the datastore. For example
        a hierarchical data structure in a NoSQL database may well be stored
        as a JSON file.
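
        Examples
        --------
        A minimal sketch of copying the file artifacts behind a query result
        into a local directory (collection name and destination path are
        hypothetical)::

            refs = butler.registry.queryDatasets("raw",
                                                 collections="HSC/raw/all")
            paths = butler.retrieveArtifacts(refs, "/tmp/extracted",
                                             transfer="copy")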

        """
        return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
                                                preserve_path=preserve_path, overwrite=overwrite)

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collections: Any = None,
                      **kwds: Any) -> bool:
        """Return `True` if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
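
        Examples
        --------
        A minimal sketch (dataset type and data ID values are hypothetical)::

            if butler.datasetExists("calexp", instrument="HSC",
                                    visit=903334, detector=42):
                calexp = butler.get("calexp", instrument="HSC",
                                    visit=903334, detector=42)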

        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.datastore.exists(ref)

    def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
        """Remove one or more `~CollectionType.RUN` collections and the
        datasets within them.

        Parameters
        ----------
        names : `Iterable` [ `str` ]
            The names of the collections to remove.
        unstore : `bool`, optional
            If `True` (default), delete datasets from all datastores in which
            they are present, and attempt to roll back the registry deletions
            if datastore deletions fail (which may not always be possible).
            If `False`, datastore records for these datasets are still
            removed, but any artifacts (e.g. files) will not be.

        Raises
        ------
        TypeError
            Raised if one or more collections are not of type
            `~CollectionType.RUN`.
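
        Examples
        --------
        A minimal sketch of deleting a scratch run and its file artifacts
        (the repo path and run name are hypothetical)::

            butler = Butler("/path/to/repo", writeable=True)
            butler.removeRuns(["u/alice/scratch/run1"])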

        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        names = list(names)
        refs: List[DatasetRef] = []
        for name in names:
            collectionType = self.registry.getCollectionType(name)
            if collectionType is not CollectionType.RUN:
                raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
            refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
        with self.registry.transaction():
            if unstore:
                self.datastore.trash(refs)
            else:
                self.datastore.forget(refs)
            for name in names:
                self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()

    def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
                        unlink: Optional[List[str]] = None) -> None:
        """Remove a collection and possibly prune datasets within it.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove. If this is a
            `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
            datasets within the collection are not modified unless ``unstore``
            is `True`. If this is a `~CollectionType.RUN` collection,
            ``purge`` and ``unstore`` must be `True`, and all datasets in it
            are fully removed from the data repository.
        purge : `bool`, optional
            If `True`, permit `~CollectionType.RUN` collections to be removed,
            fully removing datasets within them. Requires ``unstore=True`` as
            well as an added precaution against accidental deletion. Must be
            `False` (default) if the collection is not a ``RUN``.
        unstore : `bool`, optional
            If `True`, remove all datasets in the collection from all
            datastores in which they appear.
        unlink : `list` [`str`], optional
            Before removing the given collection, unlink it from these
            parent collections.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or arguments are mutually
            inconsistent.
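
        Examples
        --------
        A minimal sketch of fully removing a `~CollectionType.RUN` collection
        and its datasets (the collection name is hypothetical)::

            butler.pruneCollection("u/alice/DM-50000/a", purge=True,
                                   unstore=True)

        For a `~CollectionType.TAGGED` or `~CollectionType.CHAINED`
        collection, omit ``purge`` and pass ``unstore=True`` only if the
        datasets' stored artifacts should also be removed.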

1318 """ 

1319 # See pruneDatasets comments for more information about the logic here; 

1320 # the cases are almost the same, but here we can rely on Registry to 

1321 # take care everything but Datastore deletion when we remove the 

1322 # collection. 

1323 if not self.isWriteable(): 

1324 raise TypeError("Butler is read-only.") 

1325 collectionType = self.registry.getCollectionType(name) 

1326 if purge and not unstore: 

1327 raise PurgeWithoutUnstorePruneCollectionsError() 

1328 if collectionType is CollectionType.RUN and not purge: 

1329 raise RunWithoutPurgePruneCollectionsError(collectionType) 

1330 if collectionType is not CollectionType.RUN and purge: 

1331 raise PurgeUnsupportedPruneCollectionsError(collectionType) 

1332 

1333 def remove(child: str, parent: str) -> None: 

1334 """Remove a child collection from a parent collection.""" 

1335 # Remove child from parent. 

1336 chain = list(self.registry.getCollectionChain(parent)) 

1337 try: 

1338 chain.remove(name) 

1339 except ValueError as e: 

1340 raise RuntimeError(f"{name} is not a child of {parent}") from e 

1341 self.registry.setCollectionChain(parent, chain) 

1342 

1343 with self.registry.transaction(): 

1344 if (unlink): 

1345 for parent in unlink: 

1346 remove(name, parent) 

1347 if unstore: 

1348 refs = self.registry.queryDatasets(..., collections=name, findFirst=True) 

1349 self.datastore.trash(refs) 

1350 self.registry.removeCollection(name) 

1351 

1352 if unstore: 

1353 # Point of no return for removing artifacts 

1354 self.datastore.emptyTrash() 

1355 

1356 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1357 disassociate: bool = True, 

1358 unstore: bool = False, 

1359 tags: Iterable[str] = (), 

1360 purge: bool = False, 

1361 run: Optional[str] = None) -> None: 

1362 """Remove one or more datasets from a collection and/or storage. 

1363 

1364 Parameters 

1365 ---------- 

1366 refs : `~collections.abc.Iterable` of `DatasetRef` 

1367 Datasets to prune. These must be "resolved" references (not just 

1368 a `DatasetType` and data ID). 

1369 disassociate : `bool`, optional 

1370 Disassociate pruned datasets from ``tags``, or from all collections 

1371 if ``purge=True``. 

1372 unstore : `bool`, optional 

1373 If `True` (`False` is default) remove these datasets from all 

1374 datastores known to this butler. Note that this will make it 

1375 impossible to retrieve these datasets even via other collections. 

1376 Datasets that are already not stored are ignored by this option. 

1377 tags : `Iterable` [ `str` ], optional 

1378 `~CollectionType.TAGGED` collections to disassociate the datasets 

1379 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1380 `True`. 

1381 purge : `bool`, optional 

1382 If `True` (`False` is default), completely remove the dataset from 

1383 the `Registry`. To prevent accidental deletions, ``purge`` may 

1384 only be `True` if all of the following conditions are met: 

1385 

1386 - All given datasets are in the given run;

1387 - ``disassociate`` is `True`;

1388 - ``unstore`` is `True`.

1389 

1390 This mode may remove provenance information from datasets other 

1391 than those provided, and should be used with extreme care. 

1392 

1393 Raises 

1394 ------ 

1395 TypeError 

1396 Raised if the butler is read-only, if no collection was provided, 

1397 or the conditions for ``purge=True`` were not met. 
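
Examples
--------
A minimal usage sketch, assuming a writeable butler; the dataset type
``calexp`` and the collection name ``my-tag`` are placeholders::

    refs = list(butler.registry.queryDatasets("calexp",
                                              collections="my-tag"))

    # Only remove the datasets from the TAGGED collection.
    butler.pruneDatasets(refs, disassociate=True, tags=["my-tag"])

    # Alternatively, delete the datasets entirely: registry entries
    # and stored artifacts.
    butler.pruneDatasets(refs, purge=True, unstore=True)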

1398 """ 

1399 if not self.isWriteable(): 

1400 raise TypeError("Butler is read-only.") 

1401 if purge: 

1402 if not disassociate: 

1403 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1404 if not unstore: 

1405 raise TypeError("Cannot pass purge=True without unstore=True.") 

1406 elif disassociate: 

1407 tags = tuple(tags) 

1408 if not tags: 

1409 raise TypeError("No tags provided but disassociate=True.") 

1410 for tag in tags: 

1411 collectionType = self.registry.getCollectionType(tag) 

1412 if collectionType is not CollectionType.TAGGED: 

1413 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1414 f"of non-TAGGED type {collectionType.name}.") 

1415 # Transform possibly-single-pass iterable into something we can iterate 

1416 # over multiple times. 

1417 refs = list(refs) 

1418 # Pruning a component of a DatasetRef makes no sense since registry 

1419 # doesn't know about components and datastore might not store 

1420 # components in a separate file 

1421 for ref in refs: 

1422 if ref.datasetType.component(): 

1423 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")

1424 # We don't need an unreliable Datastore transaction for this, because 

1425 # we've been extra careful to ensure that Datastore.trash only involves 

1426 # mutating the Registry (it can _look_ at Datastore-specific things, 

1427 # but shouldn't change them), and hence all operations here are 

1428 # Registry operations. 

1429 with self.registry.transaction(): 

1430 if unstore: 

1431 self.datastore.trash(refs) 

1432 if purge: 

1433 self.registry.removeDatasets(refs) 

1434 elif disassociate: 

1435 assert tags, "Guaranteed by earlier logic in this function." 

1436 for tag in tags: 

1437 self.registry.disassociate(tag, refs) 

1438 # We've exited the Registry transaction, and apparently committed. 

1439 # (if there was an exception, everything rolled back, and it's as if 

1440 # nothing happened - and we never get here). 

1441 # Datastore artifacts are not yet gone, but they're clearly marked 

1442 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1443 # problems we can try again later, and if manual administrative 

1444 # intervention is required, it's pretty clear what that should entail: 

1445 # deleting everything on disk and in private Datastore tables that is 

1446 # in the dataset_location_trash table. 

1447 if unstore: 

1448 # Point of no return for removing artifacts 

1449 self.datastore.emptyTrash() 

1450 

1451 @transactional 

1452 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1453 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1454 ) -> None: 

1455 """Store and register one or more datasets that already exist on disk. 

1456 

1457 Parameters 

1458 ---------- 

1459 datasets : `FileDataset` 

1460 Each positional argument is a struct containing information about 

1461 a file to be ingested, including its path (either absolute or 

1462 relative to the datastore root, if applicable), a `DatasetRef`, 

1463 and optionally a formatter class or its fully-qualified string 

1464 name. If a formatter is not provided, the formatter that would be 

1465 used for `put` is assumed. On successful return, all 

1466 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1467 attribute populated and all `FileDataset.formatter` attributes will 

1468 be set to the formatter class used. `FileDataset.path` attributes 

1469 may be modified to put paths in whatever the datastore considers a 

1470 standardized form. 

1471 transfer : `str`, optional 

1472 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1473 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer 

1474 the file. 

1475 run : `str`, optional 

1476 The name of the run ingested datasets should be added to, 

1477 overriding ``self.run``. 

1478 idGenerationMode : `DatasetIdGenEnum`, optional 

1479 Specifies option for generating dataset IDs. By default unique IDs 

1480 are generated for each inserted dataset. 

1481 

1482 Raises 

1483 ------ 

1484 TypeError 

1485 Raised if the butler is read-only or if no run was provided. 

1486 NotImplementedError 

1487 Raised if the `Datastore` does not support the given transfer mode. 

1488 DatasetTypeNotSupportedError 

1489 Raised if one or more files to be ingested have a dataset type that 

1490 is not supported by the `Datastore`.

1491 FileNotFoundError 

1492 Raised if one of the given files does not exist. 

1493 FileExistsError 

1494 Raised if transfer is not `None` but the (internal) location the 

1495 file would be moved to is already occupied. 

1496 

1497 Notes 

1498 ----- 

1499 This operation is not fully exception safe: if a database operation 

1500 fails, the given `FileDataset` instances may be only partially updated. 

1501 

1502 It is atomic in terms of database operations (they will either all 

1503 succeed or all fail), provided the database engine implements

1504 transactions correctly. It will attempt to be atomic in terms of 

1505 filesystem operations as well, but this cannot be implemented 

1506 rigorously for most datastores. 
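
Examples
--------
A minimal usage sketch, assuming a writeable butler and an existing
``raw`` dataset type; the data ID values, file path, and run name are
placeholders::

    from lsst.daf.butler import DatasetRef, FileDataset

    rawType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(rawType, {"instrument": "MyCam", "exposure": 1,
                               "detector": 0})
    butler.ingest(FileDataset(path="/data/raw_0.fits", refs=[ref]),
                  transfer="copy", run="MyCam/raw/all")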

1507 """ 

1508 if not self.isWriteable(): 

1509 raise TypeError("Butler is read-only.") 

1510 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1511 # Reorganize the inputs so they're grouped by DatasetType and then 

1512 # data ID. We also include a list of DatasetRefs for each FileDataset 

1513 to hold the resolved DatasetRefs returned by the Registry, until

1514 # it's safe to swap them into FileDataset.refs. 

1515 # Some type annotation aliases to make that clearer: 

1516 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1517 GroupedData = MutableMapping[DatasetType, GroupForType] 

1518 # The actual data structure: 

1519 groupedData: GroupedData = defaultdict(dict) 

1520 # And the nested loop that populates it: 

1521 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1522 # This list intentionally shared across the inner loop, since it's 

1523 # associated with `dataset`. 

1524 resolvedRefs: List[DatasetRef] = [] 

1525 for ref in dataset.refs: 

1526 if ref.dataId in groupedData[ref.datasetType]: 

1527 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"

1528 " DataId as other ingest dataset"

1529 f" {groupedData[ref.datasetType][ref.dataId][0].path}"

1530 f" ({ref.dataId})")

1531 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1532 

1533 # Now we can bulk-insert into Registry for each DatasetType. 

1534 allResolvedRefs: List[DatasetRef] = [] 

1535 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1536 desc="Bulk-inserting datasets by type"): 

1537 refs = self.registry.insertDatasets( 

1538 datasetType, 

1539 dataIds=groupForType.keys(), 

1540 run=run, 

1541 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1542 idGenerationMode=idGenerationMode, 

1543 ) 

1544 # Append those resolved DatasetRefs to the new lists we set up for 

1545 # them. 

1546 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1547 resolvedRefs.append(ref) 

1548 

1549 # Go back to the original FileDatasets to replace their refs with the 

1550 # new resolved ones, and also build a big list of all refs. 

1551 allResolvedRefs = [] 

1552 for groupForType in progress.iter_chunks(groupedData.values(), 

1553 desc="Reassociating resolved dataset refs with files"): 

1554 for dataset, resolvedRefs in groupForType.values(): 

1555 dataset.refs = resolvedRefs 

1556 allResolvedRefs.extend(resolvedRefs) 

1557 

1558 # Bulk-insert everything into Datastore. 

1559 self.datastore.ingest(*datasets, transfer=transfer) 

1560 

1561 @contextlib.contextmanager 

1562 def export(self, *, directory: Optional[str] = None, 

1563 filename: Optional[str] = None, 

1564 format: Optional[str] = None, 

1565 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1566 """Export datasets from the repository represented by this `Butler`. 

1567 

1568 This method is a context manager that returns a helper object 

1569 (`RepoExportContext`) that is used to indicate what information from 

1570 the repository should be exported. 

1571 

1572 Parameters 

1573 ---------- 

1574 directory : `str`, optional 

1575 Directory dataset files should be written to if ``transfer`` is not 

1576 `None`. 

1577 filename : `str`, optional 

1578 Name for the file that will include database information associated 

1579 with the exported datasets. If this is not an absolute path and 

1580 ``directory`` is not `None`, it will be written to ``directory`` 

1581 instead of the current working directory. Defaults to 

1582 "export.{format}". 

1583 format : `str`, optional 

1584 File format for the database information file. If `None`, the 

1585 extension of ``filename`` will be used. 

1586 transfer : `str`, optional 

1587 Transfer mode passed to `Datastore.export`. 

1588 

1589 Raises 

1590 ------ 

1591 TypeError 

1592 Raised if the set of arguments passed is inconsistent. 

1593 

1594 Examples 

1595 -------- 

1596 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1597 methods are used to provide the iterables over data IDs and/or datasets 

1598 to be exported:: 

1599 

1600 with butler.export(filename="exports.yaml") as export:

1601 # Export all flats, but none of the dimension element rows 

1602 # (i.e. data ID information) associated with them. 

1603 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1604 elements=()) 

1605 # Export all datasets that start with "deepCoadd_" and all of 

1606 # their associated data ID information. 

1607 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1608 """ 

1609 if directory is None and transfer is not None: 

1610 raise TypeError("Cannot transfer without providing a directory.") 

1611 if transfer == "move": 

1612 raise TypeError("Transfer may not be 'move': export is read-only") 

1613 if format is None: 

1614 if filename is None: 

1615 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1616 else: 

1617 _, format = os.path.splitext(filename) 

1618 elif filename is None: 

1619 filename = f"export.{format}" 

1620 if directory is not None: 

1621 filename = os.path.join(directory, filename) 

1622 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1623 with open(filename, 'w') as stream: 

1624 backend = BackendClass(stream) 

1625 try: 

1626 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1627 directory=directory, transfer=transfer) 

1628 yield helper 

1629 except BaseException: 

1630 raise 

1631 else: 

1632 helper._finish() 

1633 

1634 def import_(self, *, directory: Optional[str] = None, 

1635 filename: Union[str, TextIO, None] = None, 

1636 format: Optional[str] = None, 

1637 transfer: Optional[str] = None, 

1638 skip_dimensions: Optional[Set] = None, 

1639 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1640 reuseIds: bool = False) -> None: 

1641 """Import datasets into this repository that were exported from a 

1642 different butler repository via `~lsst.daf.butler.Butler.export`. 

1643 

1644 Parameters 

1645 ---------- 

1646 directory : `str`, optional 

1647 Directory containing dataset files to import from. If `None`, 

1648 ``filename`` and all dataset file paths specified therein must 

1649 be absolute. 

1650 filename : `str` or `TextIO`, optional 

1651 A stream or name of file that contains database information 

1652 associated with the exported datasets, typically generated by 

1653 `~lsst.daf.butler.Butler.export`. If this a string (name) and 

1654 is not an absolute path, does not exist in the current working 

1655 directory, and ``directory`` is not `None`, it is assumed to be in 

1656 ``directory``. Defaults to "export.{format}". 

1657 format : `str`, optional 

1658 File format for ``filename``. If `None`, the extension of 

1659 ``filename`` will be used. 

1660 transfer : `str`, optional 

1661 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1662 skip_dimensions : `set`, optional 

1663 Names of dimensions that should be skipped and not imported. 

1664 idGenerationMode : `DatasetIdGenEnum`, optional 

1665 Specifies option for generating dataset IDs when IDs are not 

1666 provided or their type does not match backend type. By default 

1667 unique IDs are generated for each inserted dataset. 

1668 reuseIds : `bool`, optional 

1669 If `True` then forces re-use of imported dataset IDs for integer 

1670 IDs which are normally generated as auto-incremented; exception 

1671 will be raised if imported IDs clash with existing ones. This 

1672 option has no effect on the use of globally-unique IDs which are 

1673 always re-used (or generated if integer IDs are being imported). 

1674 

1675 Raises 

1676 ------ 

1677 TypeError 

1678 Raised if the set of arguments passed is inconsistent, or if the 

1679 butler is read-only. 
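
Examples
--------
A minimal usage sketch; the directory and file names are placeholders
for an export previously written by `~lsst.daf.butler.Butler.export`::

    butler.import_(directory="/path/to/exportdir",
                   filename="exports.yaml",
                   transfer="symlink")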

1680 """ 

1681 if not self.isWriteable(): 

1682 raise TypeError("Butler is read-only.") 

1683 if format is None: 

1684 if filename is None: 

1685 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1686 else: 

1687 _, format = os.path.splitext(filename) # type: ignore 

1688 elif filename is None: 

1689 filename = f"export.{format}" 

1690 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1691 filename = os.path.join(directory, filename) 

1692 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1693 

1694 def doImport(importStream: TextIO) -> None: 

1695 backend = BackendClass(importStream, self.registry) 

1696 backend.register() 

1697 with self.transaction(): 

1698 backend.load(self.datastore, directory=directory, transfer=transfer, 

1699 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1700 reuseIds=reuseIds) 

1701 

1702 if isinstance(filename, str): 

1703 with open(filename, "r") as stream: 

1704 doImport(stream) 

1705 else: 

1706 doImport(filename) 

1707 

1708 @transactional 

1709 def transfer_from(self, source_butler: Butler, source_refs: Iterable[DatasetRef], 

1710 transfer: str = "auto", 

1711 id_gen_map: Optional[Dict[str, DatasetIdGenEnum]] = None) -> List[DatasetRef]:

1712 """Transfer datasets to this Butler from a run in another Butler. 

1713 

1714 Parameters 

1715 ---------- 

1716 source_butler : `Butler` 

1717 Butler from which the datasets are to be transferred. 

1718 source_refs : iterable of `DatasetRef` 

1719 Datasets defined in the source butler that should be transferred to 

1720 this butler. 

1721 transfer : `str`, optional 

1722 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

1723 id_gen_map : `dict` of [`str`, `DatasetIdGenEnum`], optional 

1724 A mapping of dataset type to ID generation mode. Only used if 

1725 the source butler is using integer IDs. Should not be used 

1726 if this receiving butler uses integer IDs. Without this, dataset

1727 import always uses unique IDs.

1728 

1729 Returns 

1730 ------- 

1731 refs : `list` of `DatasetRef` 

1732 The refs added to this Butler. 

1733 

1734 Notes 

1735 ----- 

1736 Requires that any dimension definitions are already present in the 

1737 receiving Butler. The datastore artifact has to exist for a transfer 

1738 to be made but non-existence is not an error. 

1739 

1740 Datasets that already exist in this run will be skipped. 
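
Examples
--------
A minimal usage sketch; the source repository path, dataset type, and
collection name are placeholders::

    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("calexp",
                                         collections="source-run")
    butler.transfer_from(source, refs, transfer="copy")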

1741 """ 

1742 if not self.isWriteable(): 

1743 raise TypeError("Butler is read-only.") 

1744 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1745 

1746 # Will iterate through the refs multiple times so need to convert 

1747 # to a list if this isn't a collection. 

1748 if not isinstance(source_refs, collections.abc.Collection): 

1749 source_refs = list(source_refs) 

1750 

1751 log.info("Transferring %d datasets into %s", len(source_refs), str(self)) 

1752 

1753 if id_gen_map is None: 

1754 id_gen_map = {} 

1755 

1756 # Importing requires that we group the refs by dataset type and run 

1757 # before doing the import. 

1758 grouped_refs = defaultdict(list) 

1759 grouped_indices = defaultdict(list) 

1760 for i, ref in enumerate(source_refs): 

1761 grouped_refs[ref.datasetType, ref.run].append(ref) 

1762 grouped_indices[ref.datasetType, ref.run].append(i) 

1763 

1764 # The returned refs should be identical for UUIDs. 

1765 # For now must also support integers and so need to retain the 

1766 # newly-created refs from this registry. 

1767 # Pre-size it so we can assign refs into the correct slots 

1768 transferred_refs_tmp: List[Optional[DatasetRef]] = [None] * len(source_refs) 

1769 default_id_gen = DatasetIdGenEnum.UNIQUE 

1770 

1771 for (datasetType, run), refs_to_import in progress.iter_item_chunks(grouped_refs.items(), 

1772 desc="Importing to registry by " 

1773 "run and dataset type"): 

1774 run_doc = source_butler.registry.getCollectionDocumentation(run) 

1775 self.registry.registerCollection(run, CollectionType.RUN, doc=run_doc) 

1776 

1777 id_generation_mode = default_id_gen 

1778 if isinstance(refs_to_import[0].id, int): 

1779 # ID generation mode might need to be overridden when 

1780 # targeting UUIDs

1781 id_generation_mode = id_gen_map.get(datasetType.name, default_id_gen) 

1782 

1783 n_refs = len(refs_to_import) 

1784 log.log(VERBOSE, "Importing %d ref%s of dataset type %s into run %s", 

1785 n_refs, "" if n_refs == 1 else "s", datasetType.name, run) 

1786 

1787 # No way to know if this butler's registry uses UUID. 

1788 # We have to trust the caller on this. If it fails they will have 

1789 # to change their approach. We can't catch the exception and 

1790 # retry with unique because that will mess up the transaction 

1791 # handling. We aren't allowed to ask the registry manager what 

1792 # type of ID it is using. 

1793 imported_refs = self.registry._importDatasets(refs_to_import, 

1794 idGenerationMode=id_generation_mode, 

1795 expand=False) 

1796 

1797 # Map them into the correct slots to match the initial order 

1798 for i, ref in zip(grouped_indices[datasetType, run], imported_refs): 

1799 transferred_refs_tmp[i] = ref 

1800 

1801 # Mypy insists that we might have None in here so we have to make 

1802 # that explicit by assigning to a new variable and filtering out 

1803 # something that won't be there. 

1804 transferred_refs = [ref for ref in transferred_refs_tmp if ref is not None] 

1805 

1806 # Check consistency 

1807 assert len(source_refs) == len(transferred_refs), "Different number of refs imported than given" 

1808 

1809 log.log(VERBOSE, "Imported %d datasets into destination butler", len(transferred_refs)) 

1810 

1811 # The transferred refs were reordered above to match the original

1812 # ordering given by the caller; without this the datastore transfer

1813 # would be broken.

1814 

1815 # Ask the datastore to transfer. The datastore has to check that 

1816 # the source datastore is compatible with the target datastore. 

1817 self.datastore.transfer_from(source_butler.datastore, source_refs, 

1818 local_refs=transferred_refs, transfer=transfer) 

1819 

1820 return transferred_refs 

1821 

1822 def validateConfiguration(self, logFailures: bool = False, 

1823 datasetTypeNames: Optional[Iterable[str]] = None, 

1824 ignore: Optional[Iterable[str]] = None) -> None:

1825 """Validate butler configuration. 

1826 

1827 Checks that each `DatasetType` can be stored in the `Datastore`. 

1828 

1829 Parameters 

1830 ---------- 

1831 logFailures : `bool`, optional 

1832 If `True`, output a log message for every validation error 

1833 detected. 

1834 datasetTypeNames : iterable of `str`, optional 

1835 The `DatasetType` names that should be checked. This allows 

1836 only a subset to be selected. 

1837 ignore : iterable of `str`, optional 

1838 Names of DatasetTypes to skip over. This can be used to skip 

1839 known problems. If a named `DatasetType` corresponds to a 

1840 composite, all components of that `DatasetType` will also be 

1841 ignored. 

1842 

1843 Raises 

1844 ------ 

1845 ButlerValidationError 

1846 Raised if there is some inconsistency with how this Butler 

1847 is configured. 
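
Examples
--------
A minimal usage sketch; the dataset type names are placeholders::

    # Check every registered dataset type, logging each failure.
    butler.validateConfiguration(logFailures=True)

    # Restrict the check to a couple of dataset types.
    butler.validateConfiguration(datasetTypeNames=["raw", "calexp"])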

1848 """ 

1849 if datasetTypeNames: 

1850 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1851 else: 

1852 datasetTypes = list(self.registry.queryDatasetTypes()) 

1853 

1854 # filter out anything from the ignore list 

1855 if ignore: 

1856 ignore = set(ignore) 

1857 datasetTypes = [e for e in datasetTypes 

1858 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1859 else: 

1860 ignore = set() 

1861 

1862 # Find all the registered instruments 

1863 instruments = set( 

1864 record.name for record in self.registry.queryDimensionRecords("instrument") 

1865 ) 

1866 

1867 # For each datasetType that has an instrument dimension, create 

1868 # a DatasetRef for each defined instrument 

1869 datasetRefs = [] 

1870 

1871 for datasetType in datasetTypes: 

1872 if "instrument" in datasetType.dimensions: 

1873 for instrument in instruments: 

1874 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

1875 conform=False) 

1876 datasetRefs.append(datasetRef) 

1877 

1878 entities: List[Union[DatasetType, DatasetRef]] = [] 

1879 entities.extend(datasetTypes) 

1880 entities.extend(datasetRefs) 

1881 

1882 datastoreErrorStr = None 

1883 try: 

1884 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

1885 except ValidationError as e: 

1886 datastoreErrorStr = str(e) 

1887 

1888 # Also check that the LookupKeys used by the datastores match 

1889 # registry and storage class definitions 

1890 keys = self.datastore.getLookupKeys() 

1891 

1892 failedNames = set() 

1893 failedDataId = set() 

1894 for key in keys: 

1895 if key.name is not None: 

1896 if key.name in ignore: 

1897 continue 

1898 

1899 # skip if specific datasetType names were requested and this 

1900 # name does not match 

1901 if datasetTypeNames and key.name not in datasetTypeNames: 

1902 continue 

1903 

1904 # See if it is a StorageClass or a DatasetType 

1905 if key.name in self.storageClasses: 

1906 pass 

1907 else: 

1908 try: 

1909 self.registry.getDatasetType(key.name) 

1910 except KeyError: 

1911 if logFailures: 

1912 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

1913 failedNames.add(key) 

1914 else: 

1915 # Dimensions are checked for consistency when the Butler 

1916 # is created and rendezvoused with a universe. 

1917 pass 

1918 

1919 # Check that the instrument is a valid instrument.

1920 # Currently only the instrument dimension is supported, so check for that.

1921 if key.dataId: 

1922 dataIdKeys = set(key.dataId) 

1923 if set(["instrument"]) != dataIdKeys: 

1924 if logFailures: 

1925 log.critical("Key '%s' has unsupported DataId override", key) 

1926 failedDataId.add(key) 

1927 elif key.dataId["instrument"] not in instruments: 

1928 if logFailures: 

1929 log.critical("Key '%s' has unknown instrument", key) 

1930 failedDataId.add(key) 

1931 

1932 messages = [] 

1933 

1934 if datastoreErrorStr: 

1935 messages.append(datastoreErrorStr) 

1936 

1937 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

1938 (failedDataId, "Keys with bad DataId entries: ")): 

1939 if failed: 

1940 msg += ", ".join(str(k) for k in failed) 

1941 messages.append(msg) 

1942 

1943 if messages: 

1944 raise ValidationError(";\n".join(messages)) 

1945 

1946 @property 

1947 def collections(self) -> CollectionSearch: 

1948 """The collections to search by default, in order (`CollectionSearch`). 

1949 

1950 This is an alias for ``self.registry.defaults.collections``. It cannot 

1951 be set directly in isolation, but all defaults may be changed together 

1952 by assigning a new `RegistryDefaults` instance to 

1953 ``self.registry.defaults``. 

1954 """ 

1955 return self.registry.defaults.collections 

1956 

1957 @property 

1958 def run(self) -> Optional[str]: 

1959 """Name of the run this butler writes outputs to by default (`str` or 

1960 `None`). 

1961 

1962 This is an alias for ``self.registry.defaults.run``. It cannot be set 

1963 directly in isolation, but all defaults may be changed together by 

1964 assigning a new `RegistryDefaults` instance to 

1965 ``self.registry.defaults``. 

1966 """ 

1967 return self.registry.defaults.run 

1968 

1969 registry: Registry 

1970 """The object that manages dataset metadata and relationships (`Registry`). 

1971 

1972 Most operations that don't involve reading or writing butler datasets are 

1973 accessible only via `Registry` methods. 

1974 """ 

1975 

1976 datastore: Datastore 

1977 """The object that manages actual dataset storage (`Datastore`). 

1978 

1979 Direct user access to the datastore should rarely be necessary; the primary 

1980 exception is the case where a `Datastore` implementation provides extra 

1981 functionality beyond what the base class defines. 

1982 """ 

1983 

1984 storageClasses: StorageClassFactory 

1985 """An object that maps known storage class names to objects that fully 

1986 describe them (`StorageClassFactory`). 

1987 """ 

1988 

1989 _allow_put_of_predefined_dataset: bool 

1990 """Allow a put to succeed even if there is already a registry entry for it 

1991 but not a datastore record. (`bool`)."""