# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
Butler top level classes.
"""
from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
    "PruneCollectionsArgsError",
    "PurgeWithoutUnstorePruneCollectionsError",
    "RunWithoutPurgePruneCollectionsError",
    "PurgeUnsupportedPruneCollectionsError",
)


from collections import defaultdict
import contextlib
import logging
import numbers
import os
from typing import (
    Any,
    ClassVar,
    Counter,
    Dict,
    Iterable,
    Iterator,
    List,
    MutableMapping,
    Optional,
    Set,
    TextIO,
    Tuple,
    Type,
    Union,
)

try:
    import boto3
except ImportError:
    boto3 = None

from lsst.utils import doImport
from .core import (
    AmbiguousDatasetError,
    ButlerURI,
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetRef,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    FileDataset,
    Progress,
    StorageClassFactory,
    Timespan,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional, getClassOf
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._butlerConfig import ButlerConfig
from .registry import (
    Registry,
    RegistryConfig,
    RegistryDefaults,
    CollectionSearch,
    CollectionType,
    ConflictingDefinitionError,
    DatasetIdGenEnum,
)
from .transfers import RepoExportContext

log = logging.getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""
    pass


class PruneCollectionsArgsError(TypeError):
    """Base class for errors relating to Butler.pruneCollections input
    arguments.
    """
    pass


class PurgeWithoutUnstorePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge and unstore are both required to be True, and
    purge is True but unstore is False.
    """

    def __init__(self) -> None:
        super().__init__("Cannot pass purge=True without unstore=True.")


class RunWithoutPurgePruneCollectionsError(PruneCollectionsArgsError):
    """Raised when pruning a RUN collection but purge is False."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(f"Cannot prune RUN collection {self.collectionType.name} without purge=True.")


class PurgeUnsupportedPruneCollectionsError(PruneCollectionsArgsError):
    """Raised when purge is True but is not supported for the given
    collection."""

    def __init__(self, collectionType: CollectionType):
        self.collectionType = collectionType
        super().__init__(
            f"Cannot prune {self.collectionType} collection {self.collectionType.name} with purge=True.")

class Butler:
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional.
        Configuration. Anything acceptable to the
        `ButlerConfig` constructor. If a directory path
        is given the configuration will be read from a ``butler.yaml`` file in
        that location. If `None` is given default values will be used.
    butler : `Butler`, optional.
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may
        be manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not `None`,
        ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if any of ``run``, ``tags``,
        or ``chains`` is non-empty.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension. Nonexistent collections are
        ignored. If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    **kwargs : `str`
        Default data ID key-value pairs. These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts
    with the collections in its `Registry`, the most common cases are still
    simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use
    ``butler.registry``, e.g. for inserting dimension data or managing
    collections, or when the collections you want to use with the butler are
    not consistent.
    Passing ``writeable`` explicitly here is only necessary if you want to be
    able to make changes to the repo - usually the value for ``writeable``
    can be guessed from the collection arguments provided, but it defaults to
    `False` when there are no collection arguments.
    """

    def __init__(self, config: Union[Config, str, None] = None, *,
                 butler: Optional[Butler] = None,
                 collections: Any = None,
                 run: Optional[str] = None,
                 searchPaths: Optional[List[str]] = None,
                 writeable: Optional[bool] = None,
                 inferDefaults: bool = True,
                 **kwargs: str,
                 ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError("Cannot pass 'config', 'searchPaths', or 'writeable' "
                                "arguments with 'butler' argument.")
            self.registry = butler.registry.copy(defaults)
            self.datastore = butler.datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths)
            if "root" in self._config:
                butlerRoot = self._config["root"]
            else:
                butlerRoot = self._config.configDir
            if writeable is None:
                writeable = run is not None
            self.registry = Registry.fromConfig(self._config, butlerRoot=butlerRoot, writeable=writeable,
                                                defaults=defaults)
            self.datastore = Datastore.fromConfig(self._config, self.registry.getDatastoreBridgeManager(),
                                                  butlerRoot=butlerRoot)
            self.storageClasses = StorageClassFactory()
            self.storageClasses.addFromConfig(self._config)
        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    @staticmethod
    def makeRepo(root: str, config: Union[Config, str, None] = None,
                 dimensionConfig: Union[Config, str, None] = None, standalone: bool = False,
                 searchPaths: Optional[List[str]] = None, forceConfigRoot: bool = True,
                 outfile: Optional[str] = None, overwrite: bool = False) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str` or `ButlerURI`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize registry
            database.
        standalone : `bool`
            If True, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `str`, optional
            If not-`None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made
        uri = ButlerURI(root, forceDirectory=True)
        uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        datastoreClass: Type[Datastore] = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # if key exists in given config, parse it, otherwise parse the defaults
        # in the expanded config
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(RegistryConfig, config, full,
                                    toUpdate={"db": defaultDatabaseUri},
                                    overwrite=forceConfigRoot)
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",),
                                    overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore. Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toCopy=("managers",), overwrite=False)
        configURI: Union[str, ButlerURI]
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = uri.geturl()
            configURI = outfile
        else:
            configURI = uri
        config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root)

        return config
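
    # Illustrative usage sketch (comment only, not part of the API): create a
    # new repository with ``makeRepo`` and then open it for writing. The path
    # and run name below are hypothetical placeholders.
    #
    #     Butler.makeRepo("/path/to/repo")
    #     butler = Butler("/path/to/repo", run="u/alice/ingest")
    #
    # ``makeRepo`` only writes the configuration and creates the registry
    # tables; datasets are added later with `Butler.put` or `Butler.ingest`.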

    @classmethod
    def _unpickle(cls, config: ButlerConfig, collections: Optional[CollectionSearch], run: Optional[str],
                  defaultDataId: Dict[str, str], writeable: bool) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `CollectionSearch`
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think '**defaultDataId' is a _positional_ argument!
        return cls(config=config, collections=collections, run=run, writeable=writeable,
                   **defaultDataId)  # type: ignore

    def __reduce__(self) -> tuple:
        """Support pickling.
        """
        return (Butler._unpickle, (self._config, self.collections, self.run,
                                   self.registry.defaults.dataId.byName(),
                                   self.registry.isWriteable()))

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self.datastore, self.registry)

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations.
        """
        return self.registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        with self.registry.transaction():
            with self.datastore.transaction():
                yield
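
    # Illustrative usage sketch (comment only): group several writes so they
    # commit or roll back together. The dataset type names and data ID values
    # are hypothetical placeholders.
    #
    #     with butler.transaction():
    #         butler.put(catalog, "sourceTable", visit=12345, instrument="HSC")
    #         butler.put(metadata, "sourceTable_metadata", visit=12345, instrument="HSC")
    #
    # If either ``put`` raises, the registry and datastore changes made inside
    # the block should be rolled back together.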

    def _standardizeArgs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                         dataId: Optional[DataId] = None, **kwds: Any
                         ) -> Tuple[DatasetType, Optional[DataId]]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwds``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: Optional[DatasetType] = None
        internalDatasetType: Optional[DatasetType] = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwds:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self.registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self.registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                raise ValueError(f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                                 f"registry definition ({internalDatasetType})")

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _findDatasetRef(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                        dataId: Optional[DataId] = None, *,
                        collections: Any = None,
                        allowUnresolved: bool = False,
                        **kwds: Any) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        allowUnresolved : `bool`, optional
            If `True`, return an unresolved `DatasetRef` if finding a resolved
            one in the `Registry` fails. Defaults to `False`.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef):
            idNumber = datasetRefOrType.id
        else:
            idNumber = None
        timespan: Optional[Timespan] = None

        # Process dimension records that are using record information
        # rather than ids
        newDataId: Dict[str, DataIdValue] = {}
        byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict)

        # if all the dataId comes from keyword parameters we do not need
        # to do anything here because they can't be of the form
        # exposure.obs_id because a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key. We have already filtered out the compound
        # keys dimensions.record format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwds):
            # Use a list so we can adjust the dict safely in the loop
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug("Converting dimension %s to %s.%s=%s",
                                      dimensionName, dimensionName, alternate.name, value)
                            break
                    else:
                        log.warning("Type mismatch found for value '%r' provided for dimension %s. "
                                    "Could not find matching alternative (primary key has type %s) "
                                    "so attempting to use as-is.",
                                    value, dimensionName, dimension.primaryKey.getPythonType())

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions. This is made more complicated
        # by some dimensions having records with clashing names. A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Calculate missing dimensions
            provided = set(newDataId) | set(kwds) | set(byRecord)
            missingDimensions = datasetType.dimensions.names - provided

            # For calibrations we may well be needing temporal dimensions
            # so rather than always including all dimensions in the scan
            # restrict things a little. It is still possible for there
            # to be confusion over day_obs in visit vs exposure for example.
            # If we are not searching calibration collections things may
            # fail but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            candidateDimensions: Set[str] = set()
            candidateDimensions.update(missingDimensions)
            if datasetType.isCalibration():
                for dim in self.registry.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension
            guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: Dict[str, Set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions
            for dimensionName in candidateDimensions:
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions. Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions)
                    requiredButMissing = assignedDimensions.intersection(missingDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common. If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug("Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                              " Removed ambiguity by choosing dimension %s.",
                              fieldName, ", ".join(assignedDimensions), selected)

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s",
                              dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.warning("DataId specified explicit %s dimension value of %s in addition to"
                                " general record specifiers for it of %s. Ignoring record information.",
                                dimensionName, newDataId[dimensionName], str(values))
                    continue

                # Build up a WHERE expression -- use single quotes
                def quote(s: Any) -> str:
                    if isinstance(s, str):
                        return f"'{s}'"
                    else:
                        return s

                where = " AND ".join(f"{dimensionName}.{k} = {quote(v)}"
                                     for k, v in values.items())

                # Hopefully we get a single record that matches
                records = set(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId,
                                                                  where=where, **kwds))

                if len(records) != 1:
                    if len(records) > 1:
                        log.debug("Received %d records from constraints of %s", len(records), str(values))
                        for r in records:
                            log.debug("- %s", str(r))
                        raise RuntimeError(f"DataId specification for dimension {dimensionName} is not"
                                           f" uniquely constrained to a single dataset by {values}."
                                           f" Got {len(records)} results.")
                    raise RuntimeError(f"DataId specification for dimension {dimensionName} matched no"
                                       f" records when constrained by {values}")

                # Get the primary key from the real dimension object
                dimension = self.registry.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

            # We have modified the dataId so need to switch to it
            dataId = newDataId

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to
            # standardize the data ID without restricting the dimensions to
            # those of the dataset type requested, because there may be extra
            # dimensions that provide temporal information for a
            # validity-range lookup.
            dataId = DataCoordinate.standardize(dataId, universe=self.registry.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
            if dataId.graph.temporal:
                dataId = self.registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                                defaults=self.registry.defaults.dataId, **kwds)
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if allowUnresolved:
                return DatasetRef(datasetType, dataId)
            else:
                if collections is None:
                    collections = self.registry.defaults.collections
                raise LookupError(f"Dataset {datasetType.name} with data ID {dataId} "
                                  f"could not be found in collections {collections}.")
        if idNumber is not None and idNumber != ref.id:
            if collections is None:
                collections = self.registry.defaults.collections
            raise ValueError(f"DatasetRef.id provided ({idNumber}) does not match "
                             f"id ({ref.id}) in registry in collections {collections}.")
        return ref
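
    # Illustrative sketch (comment only) of the record-based data ID forms
    # that ``_findDatasetRef`` resolves on behalf of the public read methods.
    # The dataset type names and values are hypothetical placeholders.
    #
    #     # A compound "dimension.record" key in the data ID dict is resolved
    #     # to the dimension's primary key via a dimension-record query.
    #     raw = butler.get("raw", {"instrument": "HSC", "exposure.obs_id": "HSCA90333132"})
    #
    #     # A value matching an alternate key (here a detector name rather
    #     # than its integer id) is converted in the same way.
    #     raw = butler.get("raw", instrument="HSC", exposure=903331, detector="1_53")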

    @transactional
    def put(self, obj: Any, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            run: Optional[str] = None,
            **kwds: Any) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
        """
        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
        if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
            raise ValueError("DatasetRef must not be in registry, must have None id")

        # Add Registry Dataset entry.
        dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwds)
        ref, = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId])

        # Add Datastore entry.
        self.datastore.put(obj, ref)

        return ref
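
    # Illustrative usage sketch (comment only): store an object under a
    # dataset type name and a data ID. The dataset type and data ID values
    # are hypothetical placeholders; the butler must have a default ``run``
    # (or one must be passed explicitly).
    #
    #     ref = butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=42)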

    def getDirect(self, ref: DatasetRef, *, parameters: Optional[Dict[str, Any]] = None) -> Any:
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        return self.datastore.get(ref, parameters=parameters)

    def getDirectDeferred(self, ref: DatasetRef, *,
                          parameters: Union[dict, None] = None) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id is None``, i.e. the reference is unresolved.
        """
        if ref.id is None:
            raise AmbiguousDatasetError(
                f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved."
            )
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)

    def getDeferred(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                    dataId: Optional[DataId] = None, *,
                    parameters: Union[dict, None] = None,
                    collections: Any = None,
                    **kwds: Any) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``allowUnresolved is False``).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters)
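
    # Illustrative usage sketch (comment only): defer the datastore read until
    # the data are actually needed, e.g. inside a loop over many data IDs.
    # The dataset type, data ID values, and read parameter are hypothetical
    # placeholders.
    #
    #     handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=42)
    #     cutout = handle.get(parameters={"bbox": bbox})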

    def get(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
            dataId: Optional[DataId] = None, *,
            parameters: Optional[Dict[str, Any]] = None,
            collections: Any = None,
            **kwds: Any) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        TypeError
            Raised if no collections were provided.

        Notes
        -----
        When looking up datasets in a `~CollectionType.CALIBRATION`
        collection, this method requires that the given data ID include
        temporal dimensions beyond the dimensions of the dataset type itself,
        in order to find the dataset with the appropriate validity range.
        For example, a "bias" dataset with native dimensions
        ``{instrument, detector}`` could be fetched with a
        ``{instrument, detector, exposure}`` data ID, because ``exposure`` is
        a temporal dimension.
        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.getDirect(ref, parameters=parameters)
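
    # Illustrative usage sketch (comment only), following the calibration
    # example in the Notes above. The data ID values are hypothetical
    # placeholders.
    #
    #     # Ordinary lookup using the data ID of the dataset type itself.
    #     calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)
    #
    #     # Calibration lookup: the extra temporal dimension (exposure) selects
    #     # the bias whose validity range contains that exposure's timespan.
    #     bias = butler.get("bias", instrument="HSC", detector=42, exposure=903334)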

    def getURIs(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                dataId: Optional[DataId] = None, *,
                predict: bool = False,
                collections: Any = None,
                run: Optional[str] = None,
                **kwds: Any) -> Tuple[Optional[ButlerURI], Dict[str, ButlerURI]]:
        """Returns the URIs associated with the dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        primary : `ButlerURI`
            The URI to the primary artifact associated with this dataset.
            If the dataset was disassembled within the datastore this
            may be `None`.
        components : `dict`
            URIs to any components associated with the dataset artifact.
            Can be empty if there are no components.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, allowUnresolved=predict,
                                   collections=collections, **kwds)
        if ref.id is None:  # only possible if predict is True
            if run is None:
                run = self.run
                if run is None:
                    raise TypeError("Cannot predict location with run=None.")
            # Lie about ID, because we can't guess it, and only
            # Datastore.getURIs() will ever see it (and it doesn't use it).
            ref = ref.resolved(id=0, run=run)
        return self.datastore.getURIs(ref, predict)

    def getURI(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
               dataId: Optional[DataId] = None, *,
               predict: bool = False,
               collections: Any = None,
               run: Optional[str] = None,
               **kwds: Any) -> ButlerURI:
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        run : `str`, optional
            Run to use for predictions, overriding ``self.run``.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        uri : `ButlerURI`
            URI pointing to the Dataset within the datastore. If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive. The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        LookupError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        RuntimeError
            Raised if a URI is requested for a dataset that consists of
            multiple artifacts.
        """
        primary, components = self.getURIs(datasetRefOrType, dataId=dataId, predict=predict,
                                           collections=collections, run=run, **kwds)

        if primary is None or components:
            raise RuntimeError(f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
                               "Use Butler.getURIs() instead.")
        return primary
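
    # Illustrative usage sketch (comment only): look up the file artifact for
    # an existing dataset, or predict where a future dataset would land. The
    # dataset type, data ID values, and run are hypothetical placeholders.
    #
    #     uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42)
    #     future_uri = butler.getURI("calexp", instrument="HSC", visit=903335, detector=42,
    #                                predict=True, run="u/alice/DM-50000/a")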

    def retrieveArtifacts(self, refs: Iterable[DatasetRef],
                          destination: Union[str, ButlerURI], transfer: str = "auto",
                          preserve_path: bool = True,
                          overwrite: bool = False) -> List[ButlerURI]:
        """Retrieve the artifacts associated with the supplied refs.

        Parameters
        ----------
        refs : iterable of `DatasetRef`
            The datasets for which artifacts are to be retrieved.
            A single ref can result in multiple artifacts. The refs must
            be resolved.
        destination : `ButlerURI` or `str`
            Location to write the artifacts.
        transfer : `str`, optional
            Method to use to transfer the artifacts. Must be one of the
            options supported by `ButlerURI.transfer_from()`. "move" is not
            allowed.
        preserve_path : `bool`, optional
            If `True` the full path of the artifact within the datastore
            is preserved. If `False` the final file component of the path
            is used.
        overwrite : `bool`, optional
            If `True` allow transfers to overwrite existing files at the
            destination.

        Returns
        -------
        targets : `list` of `ButlerURI`
            URIs of file artifacts in destination location. Order is not
            preserved.

        Notes
        -----
        For non-file datastores the artifacts written to the destination
        may not match the representation inside the datastore. For example
        a hierarchical data structure in a NoSQL database may well be stored
        as a JSON file.
        """
        return self.datastore.retrieveArtifacts(refs, ButlerURI(destination), transfer=transfer,
                                                preserve_path=preserve_path, overwrite=overwrite)
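
    # Illustrative usage sketch (comment only): copy the file artifacts for a
    # query result out of the datastore. The dataset type, collection, and
    # destination are hypothetical placeholders.
    #
    #     refs = butler.registry.queryDatasets("calexp", collections="u/alice/DM-50000")
    #     butler.retrieveArtifacts(refs, destination="/tmp/exports", transfer="copy")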

    def datasetExists(self, datasetRefOrType: Union[DatasetRef, DatasetType, str],
                      dataId: Optional[DataId] = None, *,
                      collections: Any = None,
                      **kwds: Any) -> bool:
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        kwds
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Raises
        ------
        LookupError
            Raised if the dataset is not even present in the Registry.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwds)
        return self.datastore.exists(ref)

    def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
        """Remove one or more `~CollectionType.RUN` collections and the
        datasets within them.

        Parameters
        ----------
        names : `Iterable` [ `str` ]
            The names of the collections to remove.
        unstore : `bool`, optional
            If `True` (default), delete datasets from all datastores in which
            they are present, and attempt to roll back the registry deletions
            if datastore deletions fail (which may not always be possible).
            If `False`, datastore records for these datasets are still
            removed, but any artifacts (e.g. files) will not be.

        Raises
        ------
        TypeError
            Raised if one or more collections are not of type
            `~CollectionType.RUN`.
        """
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        names = list(names)
        refs: List[DatasetRef] = []
        for name in names:
            collectionType = self.registry.getCollectionType(name)
            if collectionType is not CollectionType.RUN:
                raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
            refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True))
        with self.registry.transaction():
            if unstore:
                for ref in refs:
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            else:
                self.datastore.forget(refs)
            for name in names:
                self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
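
    # Illustrative usage sketch (comment only): delete a scratch RUN
    # collection and its stored artifacts. The run name is a hypothetical
    # placeholder.
    #
    #     butler = Butler("/path/to/repo", writeable=True)
    #     butler.removeRuns(["u/alice/scratch"])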

    def pruneCollection(self, name: str, purge: bool = False, unstore: bool = False,
                        unlink: Optional[List[str]] = None) -> None:
        """Remove a collection and possibly prune datasets within it.

        Parameters
        ----------
        name : `str`
            Name of the collection to remove. If this is a
            `~CollectionType.TAGGED` or `~CollectionType.CHAINED` collection,
            datasets within the collection are not modified unless ``unstore``
            is `True`. If this is a `~CollectionType.RUN` collection,
            ``purge`` and ``unstore`` must be `True`, and all datasets in it
            are fully removed from the data repository.
        purge : `bool`, optional
            If `True`, permit `~CollectionType.RUN` collections to be removed,
            fully removing datasets within them. Requires ``unstore=True`` as
            well as an added precaution against accidental deletion. Must be
            `False` (default) if the collection is not a ``RUN``.
        unstore : `bool`, optional
            If `True`, remove all datasets in the collection from all
            datastores in which they appear.
        unlink : `list` [`str`], optional
            Before removing the given `collection` unlink it from these
            parent collections.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or arguments are mutually
            inconsistent.
        """

        # See pruneDatasets comments for more information about the logic
        # here; the cases are almost the same, but here we can rely on
        # Registry to take care of everything but Datastore deletion when we
        # remove the collection.
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        collectionType = self.registry.getCollectionType(name)
        if purge and not unstore:
            raise PurgeWithoutUnstorePruneCollectionsError()
        if collectionType is CollectionType.RUN and not purge:
            raise RunWithoutPurgePruneCollectionsError(collectionType)
        if collectionType is not CollectionType.RUN and purge:
            raise PurgeUnsupportedPruneCollectionsError(collectionType)

        def remove(child: str, parent: str) -> None:
            """Remove a child collection from a parent collection."""
            # Remove child from parent.
            chain = list(self.registry.getCollectionChain(parent))
            try:
                chain.remove(name)
            except ValueError as e:
                raise RuntimeError(f"{name} is not a child of {parent}") from e
            self.registry.setCollectionChain(parent, chain)

        with self.registry.transaction():
            if unlink:
                for parent in unlink:
                    remove(name, parent)
            if unstore:
                for ref in self.registry.queryDatasets(..., collections=name, findFirst=True):
                    if self.datastore.exists(ref):
                        self.datastore.trash(ref)
            self.registry.removeCollection(name)
        if unstore:
            # Point of no return for removing artifacts
            self.datastore.emptyTrash()
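
    # Illustrative usage sketch (comment only) of the argument combinations
    # the checks above allow. Collection names are hypothetical placeholders.
    #
    #     # Fully remove a RUN collection and its artifacts.
    #     butler.pruneCollection("u/alice/scratch", purge=True, unstore=True)
    #
    #     # Drop a TAGGED collection without touching stored artifacts.
    #     butler.pruneCollection("u/alice/tagged-subset")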

1324 def pruneDatasets(self, refs: Iterable[DatasetRef], *, 

1325 disassociate: bool = True, 

1326 unstore: bool = False, 

1327 tags: Iterable[str] = (), 

1328 purge: bool = False, 

1329 run: Optional[str] = None) -> None: 

1330 """Remove one or more datasets from a collection and/or storage. 

1331 

1332 Parameters 

1333 ---------- 

1334 refs : `~collections.abc.Iterable` of `DatasetRef` 

1335 Datasets to prune. These must be "resolved" references (not just 

1336 a `DatasetType` and data ID). 

1337 disassociate : `bool`, optional 

1338 Disassociate pruned datasets from ``tags``, or from all collections 

1339 if ``purge=True``. 

1340 unstore : `bool`, optional 

1341 If `True` (`False` is default) remove these datasets from all 

1342 datastores known to this butler. Note that this will make it 

1343 impossible to retrieve these datasets even via other collections. 

1344 Datasets that are already not stored are ignored by this option. 

1345 tags : `Iterable` [ `str` ], optional 

1346 `~CollectionType.TAGGED` collections to disassociate the datasets 

1347 from. Ignored if ``disassociate`` is `False` or ``purge`` is 

1348 `True`. 

1349 purge : `bool`, optional 

1350 If `True` (`False` is default), completely remove the dataset from 

1351 the `Registry`. To prevent accidental deletions, ``purge`` may 

1352 only be `True` if all of the following conditions are met: 

1353 

1354 - All given datasets are in the given run. 

1355 - ``disassociate`` is `True`; 

1356 - ``unstore`` is `True`. 

1357 

1358 This mode may remove provenance information from datasets other 

1359 than those provided, and should be used with extreme care. 

1360 

1361 Raises 

1362 ------ 

1363 TypeError 

1364 Raised if the butler is read-only, if no collection was provided, 

1365 or the conditions for ``purge=True`` were not met. 

1366 """ 

1367 if not self.isWriteable(): 

1368 raise TypeError("Butler is read-only.") 

1369 if purge: 

1370 if not disassociate: 

1371 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1372 if not unstore: 

1373 raise TypeError("Cannot pass purge=True without unstore=True.") 

1374 elif disassociate: 

1375 tags = tuple(tags) 

1376 if not tags: 

1377 raise TypeError("No tags provided but disassociate=True.") 

1378 for tag in tags: 

1379 collectionType = self.registry.getCollectionType(tag) 

1380 if collectionType is not CollectionType.TAGGED: 

1381 raise TypeError(f"Cannot disassociate from collection '{tag}' " 

1382 f"of non-TAGGED type {collectionType.name}.") 

1383 # Transform possibly-single-pass iterable into something we can iterate 

1384 # over multiple times. 

1385 refs = list(refs) 

1386 # Pruning a component of a DatasetRef makes no sense since registry 

1387 # doesn't know about components and datastore might not store 

1388 # components in a separate file 

1389 for ref in refs: 

1390 if ref.datasetType.component(): 

1391 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1392 # We don't need an unreliable Datastore transaction for this, because 

1393 # we've been extra careful to ensure that Datastore.trash only involves 

1394 # mutating the Registry (it can _look_ at Datastore-specific things, 

1395 # but shouldn't change them), and hence all operations here are 

1396 # Registry operations. 

1397 with self.registry.transaction(): 

1398 if unstore: 

1399 for ref in refs: 

1400 # There is a difference between a concrete composite 

1401 # and virtual composite. In a virtual composite the 

1402 # datastore is never given the top level DatasetRef. In 

1403 # the concrete composite the datastore knows all the 

1404 # refs and will clean up itself if asked to remove the 

1405 # parent ref. We can not check configuration for this 

1406 # since we can not trust that the configuration is the 

1407 # same. We therefore have to ask if the ref exists or 

1408 # not. This is consistent with the fact that we want 

1409 # to ignore already-removed-from-datastore datasets 

1410 # anyway. 

1411 if self.datastore.exists(ref): 

1412 self.datastore.trash(ref) 

1413 if purge: 

1414 self.registry.removeDatasets(refs) 

1415 elif disassociate: 

1416 assert tags, "Guaranteed by earlier logic in this function." 

1417 for tag in tags: 

1418 self.registry.disassociate(tag, refs) 

1419 # We've exited the Registry transaction, and apparently committed. 

1420 # (if there was an exception, everything rolled back, and it's as if 

1421 # nothing happened - and we never get here). 

1422 # Datastore artifacts are not yet gone, but they're clearly marked 

1423 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1424 # problems we can try again later, and if manual administrative 

1425 # intervention is required, it's pretty clear what that should entail: 

1426 # deleting everything on disk and in private Datastore tables that is 

1427 # in the dataset_location_trash table. 

1428 if unstore: 

1429 # Point of no return for removing artifacts 

1430 self.datastore.emptyTrash() 

1431 
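
# A short sketch of pruneDatasets, assuming a writeable repo at "/path/to/repo"
# and a dataset type "raw" in the RUN collection "u/alice/run1" (all of these
# names are hypothetical):
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo", writeable=True)
# queryDatasets returns resolved DatasetRefs, as pruneDatasets requires.
refs = list(butler.registry.queryDatasets("raw", collections="u/alice/run1"))
# Completely remove them: drop collection associations, delete the stored
# artifacts, and remove the Registry entries (purge requires disassociate
# and unstore, as documented above).
butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)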

1432 @transactional 

1433 def ingest(self, *datasets: FileDataset, transfer: Optional[str] = "auto", run: Optional[str] = None, 

1434 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1435 ) -> None: 

1436 """Store and register one or more datasets that already exist on disk. 

1437 

1438 Parameters 

1439 ---------- 

1440 datasets : `FileDataset` 

1441 Each positional argument is a struct containing information about 

1442 a file to be ingested, including its path (either absolute or 

1443 relative to the datastore root, if applicable), a `DatasetRef`, 

1444 and optionally a formatter class or its fully-qualified string 

1445 name. If a formatter is not provided, the formatter that would be 

1446 used for `put` is assumed. On successful return, all 

1447 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1448 attribute populated and all `FileDataset.formatter` attributes will 

1449 be set to the formatter class used. `FileDataset.path` attributes 

1450 may be modified to put paths in whatever the datastore considers a 

1451 standardized form. 

1452 transfer : `str`, optional 

1453 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1454 'hardlink', 'relsymlink' or 'symlink', indicating how to transfer 

1455 the file. 

1456 run : `str`, optional 

1457 The name of the run ingested datasets should be added to, 

1458 overriding ``self.run``. 

1459 idGenerationMode : `DatasetIdGenEnum`, optional 

1460 Specifies option for generating dataset IDs. By default unique IDs 

1461 are generated for each inserted dataset. 

1462 

1463 Raises 

1464 ------ 

1465 TypeError 

1466 Raised if the butler is read-only or if no run was provided. 

1467 NotImplementedError 

1468 Raised if the `Datastore` does not support the given transfer mode. 

1469 DatasetTypeNotSupportedError 

1470 Raised if one or more files to be ingested have a dataset type that 

1471 is not supported by the `Datastore`. 

1472 FileNotFoundError 

1473 Raised if one of the given files does not exist. 

1474 FileExistsError 

1475 Raised if transfer is not `None` but the (internal) location the 

1476 file would be moved to is already occupied. 

1477 

1478 Notes 

1479 ----- 

1480 This operation is not fully exception safe: if a database operation 

1481 fails, the given `FileDataset` instances may be only partially updated. 

1482 

1483 It is atomic in terms of database operations (they will either all 

1484 succeed or all fail) providing the database engine implements 

1485 transactions correctly. It will attempt to be atomic in terms of 

1486 filesystem operations as well, but this cannot be implemented 

1487 rigorously for most datastores. 

1488 """ 

1489 if not self.isWriteable(): 

1490 raise TypeError("Butler is read-only.") 

1491 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1492 # Reorganize the inputs so they're grouped by DatasetType and then 

1493 # data ID. We also include a list of DatasetRefs for each FileDataset 

1494 # to hold the resolved DatasetRefs returned by the Registry, before 

1495 # it's safe to swap them into FileDataset.refs. 

1496 # Some type annotation aliases to make that clearer: 

1497 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1498 GroupedData = MutableMapping[DatasetType, GroupForType] 

1499 # The actual data structure: 

1500 groupedData: GroupedData = defaultdict(dict) 

1501 # And the nested loop that populates it: 

1502 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1503 # This list intentionally shared across the inner loop, since it's 

1504 # associated with `dataset`. 

1505 resolvedRefs: List[DatasetRef] = [] 

1506 for ref in dataset.refs: 

1507 if ref.dataId in groupedData[ref.datasetType]: 

1508 raise ConflictingDefinitionError(f"Ingest conflict. Dataset {dataset.path} has the same"

1509 " DataId as another ingest dataset"

1510 f" {groupedData[ref.datasetType][ref.dataId][0].path}"

1511 f" ({ref.dataId})")

1512 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1513 

1514 # Now we can bulk-insert into Registry for each DatasetType. 

1515 allResolvedRefs: List[DatasetRef] = [] 

1516 for datasetType, groupForType in progress.iter_item_chunks(groupedData.items(), 

1517 desc="Bulk-inserting datasets by type"): 

1518 refs = self.registry.insertDatasets( 

1519 datasetType, 

1520 dataIds=groupForType.keys(), 

1521 run=run, 

1522 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1523 idGenerationMode=idGenerationMode, 

1524 ) 

1525 # Append those resolved DatasetRefs to the new lists we set up for 

1526 # them. 

1527 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1528 resolvedRefs.append(ref) 

1529 

1530 # Go back to the original FileDatasets to replace their refs with the 

1531 # new resolved ones, and also build a big list of all refs. 

1532 allResolvedRefs = [] 

1533 for groupForType in progress.iter_chunks(groupedData.values(), 

1534 desc="Reassociating resolved dataset refs with files"): 

1535 for dataset, resolvedRefs in groupForType.values(): 

1536 dataset.refs = resolvedRefs 

1537 allResolvedRefs.extend(resolvedRefs) 

1538 

1539 # Bulk-insert everything into Datastore. 

1540 self.datastore.ingest(*datasets, transfer=transfer) 

1541 
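
# A sketch of ingesting an existing file, assuming a writeable repo at
# "/path/to/repo", a registered dataset type "raw", and existing dimension
# records for the data ID below (the path, dataset type, and data ID values
# are all hypothetical):
from lsst.daf.butler import Butler, DatasetRef, FileDataset

butler = Butler("/path/to/repo", writeable=True, run="u/alice/ingest")
datasetType = butler.registry.getDatasetType("raw")
ref = DatasetRef(datasetType, {"instrument": "HSC", "exposure": 903334,
                               "detector": 10})
# Copy the file into the datastore; "move", "symlink", etc. are alternatives.
butler.ingest(FileDataset(path="/data/raw/some_file.fits", refs=[ref]),
              transfer="copy")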

1542 @contextlib.contextmanager 

1543 def export(self, *, directory: Optional[str] = None, 

1544 filename: Optional[str] = None, 

1545 format: Optional[str] = None, 

1546 transfer: Optional[str] = None) -> Iterator[RepoExportContext]: 

1547 """Export datasets from the repository represented by this `Butler`. 

1548 

1549 This method is a context manager that returns a helper object 

1550 (`RepoExportContext`) that is used to indicate what information from 

1551 the repository should be exported. 

1552 

1553 Parameters 

1554 ---------- 

1555 directory : `str`, optional 

1556 Directory dataset files should be written to if ``transfer`` is not 

1557 `None`. 

1558 filename : `str`, optional 

1559 Name for the file that will include database information associated 

1560 with the exported datasets. If this is not an absolute path and 

1561 ``directory`` is not `None`, it will be written to ``directory`` 

1562 instead of the current working directory. Defaults to 

1563 "export.{format}". 

1564 format : `str`, optional 

1565 File format for the database information file. If `None`, the 

1566 extension of ``filename`` will be used. 

1567 transfer : `str`, optional 

1568 Transfer mode passed to `Datastore.export`. 

1569 

1570 Raises 

1571 ------ 

1572 TypeError 

1573 Raised if the set of arguments passed is inconsistent. 

1574 

1575 Examples 

1576 -------- 

1577 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1578 methods are used to provide the iterables over data IDs and/or datasets 

1579 to be exported:: 

1580 

1581 with butler.export(filename="exports.yaml") as export: 

1582 # Export all flats, but none of the dimension element rows 

1583 # (i.e. data ID information) associated with them. 

1584 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1585 elements=()) 

1586 # Export all datasets that start with "deepCoadd_" and all of 

1587 # their associated data ID information. 

1588 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1589 """ 

1590 if directory is None and transfer is not None: 

1591 raise TypeError("Cannot transfer without providing a directory.") 

1592 if transfer == "move": 

1593 raise TypeError("Transfer may not be 'move': export is read-only") 

1594 if format is None: 

1595 if filename is None: 

1596 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1597 else: 

1598 _, format = os.path.splitext(filename) 

1599 elif filename is None: 

1600 filename = f"export.{format}" 

1601 if directory is not None: 

1602 filename = os.path.join(directory, filename) 

1603 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["export"]) 

1604 with open(filename, 'w') as stream: 

1605 backend = BackendClass(stream) 

1606 try: 

1607 helper = RepoExportContext(self.registry, self.datastore, backend=backend, 

1608 directory=directory, transfer=transfer) 

1609 yield helper 

1610 except BaseException: 

1611 raise 

1612 else: 

1613 helper._finish() 

1614 
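
# A complementary sketch that also copies the dataset files, assuming a repo
# at "/path/to/repo" and a registered dataset type "calexp" (both
# hypothetical):
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
# Write database information to exported/export.yaml and copy the selected
# dataset files under the "exported" directory.
with butler.export(directory="exported", filename="export.yaml",
                   transfer="copy") as export:
    export.saveDatasets(butler.registry.queryDatasets("calexp", collections=...))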

1615 def import_(self, *, directory: Optional[str] = None, 

1616 filename: Union[str, TextIO, None] = None, 

1617 format: Optional[str] = None, 

1618 transfer: Optional[str] = None, 

1619 skip_dimensions: Optional[Set] = None, 

1620 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1621 reuseIds: bool = False) -> None: 

1622 """Import datasets into this repository that were exported from a 

1623 different butler repository via `~lsst.daf.butler.Butler.export`. 

1624 

1625 Parameters 

1626 ---------- 

1627 directory : `str`, optional 

1628 Directory containing dataset files to import from. If `None`, 

1629 ``filename`` and all dataset file paths specified therein must 

1630 be absolute. 

1631 filename : `str` or `TextIO`, optional 

1632 A stream or name of file that contains database information 

1633 associated with the exported datasets, typically generated by 

1634 `~lsst.daf.butler.Butler.export`. If this is a string (name) and 

1635 is not an absolute path, does not exist in the current working 

1636 directory, and ``directory`` is not `None`, it is assumed to be in 

1637 ``directory``. Defaults to "export.{format}". 

1638 format : `str`, optional 

1639 File format for ``filename``. If `None`, the extension of 

1640 ``filename`` will be used. 

1641 transfer : `str`, optional 

1642 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

1643 skip_dimensions : `set`, optional 

1644 Names of dimensions that should be skipped and not imported. 

1645 idGenerationMode : `DatasetIdGenEnum`, optional 

1646 Specifies option for generating dataset IDs when IDs are not 

1647 provided or their type does not match backend type. By default 

1648 unique IDs are generated for each inserted dataset. 

1649 reuseIds : `bool`, optional 

1650 If `True` then forces re-use of imported dataset IDs for integer 

1651 IDs which are normally generated as auto-incremented; exception 

1652 will be raised if imported IDs clash with existing ones. This 

1653 option has no effect on the use of globally-unique IDs which are 

1654 always re-used (or generated if integer IDs are being imported). 

1655 

1656 Raises 

1657 ------ 

1658 TypeError 

1659 Raised if the set of arguments passed is inconsistent, or if the 

1660 butler is read-only. 

1661 """ 

1662 if not self.isWriteable(): 

1663 raise TypeError("Butler is read-only.") 

1664 if format is None: 

1665 if filename is None: 

1666 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1667 else: 

1668 _, format = os.path.splitext(filename) # type: ignore 

1669 elif filename is None: 

1670 filename = f"export.{format}" 

1671 if isinstance(filename, str) and directory is not None and not os.path.exists(filename): 

1672 filename = os.path.join(directory, filename) 

1673 BackendClass = getClassOf(self._config["repo_transfer_formats"][format]["import"]) 

1674 

1675 def doImport(importStream: TextIO) -> None: 

1676 backend = BackendClass(importStream, self.registry) 

1677 backend.register() 

1678 with self.transaction(): 

1679 backend.load(self.datastore, directory=directory, transfer=transfer, 

1680 skip_dimensions=skip_dimensions, idGenerationMode=idGenerationMode, 

1681 reuseIds=reuseIds) 

1682 

1683 if isinstance(filename, str): 

1684 with open(filename, "r") as stream: 

1685 doImport(stream) 

1686 else: 

1687 doImport(filename) 

1688 
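
# A sketch of the matching import step, assuming a second writeable repo at
# "/path/to/new_repo" and the "exported" directory written by the export
# example above (both hypothetical):
from lsst.daf.butler import Butler

butler = Butler("/path/to/new_repo", writeable=True)
# Read exported/export.yaml and copy the referenced files into this
# repository's datastore.
butler.import_(directory="exported", filename="export.yaml", transfer="copy")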

1689 def validateConfiguration(self, logFailures: bool = False, 

1690 datasetTypeNames: Optional[Iterable[str]] = None, 

1691 ignore: Optional[Iterable[str]] = None) -> None: 

1692 """Validate butler configuration. 

1693 

1694 Checks that each `DatasetType` can be stored in the `Datastore`. 

1695 

1696 Parameters 

1697 ---------- 

1698 logFailures : `bool`, optional 

1699 If `True`, output a log message for every validation error 

1700 detected. 

1701 datasetTypeNames : iterable of `str`, optional 

1702 The `DatasetType` names that should be checked. This allows 

1703 only a subset to be selected. 

1704 ignore : iterable of `str`, optional 

1705 Names of DatasetTypes to skip over. This can be used to skip 

1706 known problems. If a named `DatasetType` corresponds to a 

1707 composite, all components of that `DatasetType` will also be 

1708 ignored. 

1709 

1710 Raises 

1711 ------ 

1712 ButlerValidationError 

1713 Raised if there is some inconsistency with how this Butler 

1714 is configured. 

1715 """ 

1716 if datasetTypeNames: 

1717 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

1718 else: 

1719 datasetTypes = list(self.registry.queryDatasetTypes()) 

1720 

1721 # filter out anything from the ignore list 

1722 if ignore: 

1723 ignore = set(ignore) 

1724 datasetTypes = [e for e in datasetTypes 

1725 if e.name not in ignore and e.nameAndComponent()[0] not in ignore] 

1726 else: 

1727 ignore = set() 

1728 

1729 # Find all the registered instruments 

1730 instruments = set( 

1731 record.name for record in self.registry.queryDimensionRecords("instrument") 

1732 ) 

1733 

1734 # For each datasetType that has an instrument dimension, create 

1735 # a DatasetRef for each defined instrument 

1736 datasetRefs = [] 

1737 

1738 for datasetType in datasetTypes: 

1739 if "instrument" in datasetType.dimensions: 

1740 for instrument in instruments: 

1741 datasetRef = DatasetRef(datasetType, {"instrument": instrument}, # type: ignore 

1742 conform=False) 

1743 datasetRefs.append(datasetRef) 

1744 

1745 entities: List[Union[DatasetType, DatasetRef]] = [] 

1746 entities.extend(datasetTypes) 

1747 entities.extend(datasetRefs) 

1748 

1749 datastoreErrorStr = None 

1750 try: 

1751 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

1752 except ValidationError as e: 

1753 datastoreErrorStr = str(e) 

1754 

1755 # Also check that the LookupKeys used by the datastores match 

1756 # registry and storage class definitions 

1757 keys = self.datastore.getLookupKeys() 

1758 

1759 failedNames = set() 

1760 failedDataId = set() 

1761 for key in keys: 

1762 if key.name is not None: 

1763 if key.name in ignore: 

1764 continue 

1765 

1766 # skip if specific datasetType names were requested and this 

1767 # name does not match 

1768 if datasetTypeNames and key.name not in datasetTypeNames: 

1769 continue 

1770 

1771 # See if it is a StorageClass or a DatasetType 

1772 if key.name in self.storageClasses: 

1773 pass 

1774 else: 

1775 try: 

1776 self.registry.getDatasetType(key.name) 

1777 except KeyError: 

1778 if logFailures: 

1779 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

1780 failedNames.add(key) 

1781 else: 

1782 # Dimensions are checked for consistency when the Butler 

1783 # is created and rendezvoused with a universe. 

1784 pass 

1785 

1786 # Check that the instrument is a valid instrument 

1787 # Currently only "instrument" data ID overrides are supported, so check for that 

1788 if key.dataId: 

1789 dataIdKeys = set(key.dataId) 

1790 if {"instrument"} != dataIdKeys: 

1791 if logFailures: 

1792 log.critical("Key '%s' has unsupported DataId override", key) 

1793 failedDataId.add(key) 

1794 elif key.dataId["instrument"] not in instruments: 

1795 if logFailures: 

1796 log.critical("Key '%s' has unknown instrument", key) 

1797 failedDataId.add(key) 

1798 

1799 messages = [] 

1800 

1801 if datastoreErrorStr: 

1802 messages.append(datastoreErrorStr) 

1803 

1804 for failed, msg in ((failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

1805 (failedDataId, "Keys with bad DataId entries: ")): 

1806 if failed: 

1807 msg += ", ".join(str(k) for k in failed) 

1808 messages.append(msg) 

1809 

1810 if messages: 

1811 raise ValidationError(";\n".join(messages)) 

1812 
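
# A sketch of running the validation, assuming a repo at "/path/to/repo" with
# dataset types "raw" and "calexp" registered (all names hypothetical):
from lsst.daf.butler import Butler, ValidationError

butler = Butler("/path/to/repo")
try:
    butler.validateConfiguration(logFailures=True,
                                 datasetTypeNames=["raw", "calexp"])
except ValidationError as e:
    print(f"Butler configuration problems:\n{e}")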

1813 @property 

1814 def collections(self) -> CollectionSearch: 

1815 """The collections to search by default, in order (`CollectionSearch`). 

1816 

1817 This is an alias for ``self.registry.defaults.collections``. It cannot 

1818 be set directly in isolation, but all defaults may be changed together 

1819 by assigning a new `RegistryDefaults` instance to 

1820 ``self.registry.defaults``. 

1821 """ 

1822 return self.registry.defaults.collections 

1823 

1824 @property 

1825 def run(self) -> Optional[str]: 

1826 """Name of the run this butler writes outputs to by default (`str` or 

1827 `None`). 

1828 

1829 This is an alias for ``self.registry.defaults.run``. It cannot be set 

1830 directly in isolation, but all defaults may be changed together by 

1831 assigning a new `RegistryDefaults` instance to 

1832 ``self.registry.defaults``. 

1833 """ 

1834 return self.registry.defaults.run 

1835 
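
# A sketch of reading and replacing the defaults exposed by the two properties
# above, assuming a repo at "/path/to/repo" and the collection names shown
# (all hypothetical):
from lsst.daf.butler import Butler
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler("/path/to/repo", collections=["HSC/defaults"], run="u/alice/run1")
print(butler.collections)  # default search path, in order
print(butler.run)          # "u/alice/run1"
# Defaults cannot be changed piecemeal; assign a new RegistryDefaults instead.
butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"],
                                            run="u/alice/run2")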

1836 registry: Registry 

1837 """The object that manages dataset metadata and relationships (`Registry`). 

1838 

1839 Most operations that don't involve reading or writing butler datasets are 

1840 accessible only via `Registry` methods. 

1841 """ 

1842 

1843 datastore: Datastore 

1844 """The object that manages actual dataset storage (`Datastore`). 

1845 

1846 Direct user access to the datastore should rarely be necessary; the primary 

1847 exception is the case where a `Datastore` implementation provides extra 

1848 functionality beyond what the base class defines. 

1849 """ 

1850 

1851 storageClasses: StorageClassFactory 

1852 """An object that maps known storage class names to objects that fully 

1853 describe them (`StorageClassFactory`). 

1854 """
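
# A sketch showing direct access to the attributes documented above, assuming
# a repo at "/path/to/repo" and a storage class named "ExposureF" in its
# configuration (both hypothetical):
from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")
print(type(butler.registry).__name__)   # metadata and relationship queries
print(type(butler.datastore).__name__)  # artifact storage
print(butler.storageClasses.getStorageClass("ExposureF").name)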