Coverage for python/lsst/daf/butler/_butler.py: 8%

674 statements  

coverage.py v7.2.6, created at 2023-05-26 02:11 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import warnings 

39from collections import defaultdict 

40from typing import ( 

41 TYPE_CHECKING, 

42 Any, 

43 ClassVar, 

44 Counter, 

45 Dict, 

46 Iterable, 

47 Iterator, 

48 List, 

49 MutableMapping, 

50 Optional, 

51 Sequence, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from deprecated.sphinx import deprecated 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_class_of 

63from lsst.utils.logging import VERBOSE, getLogger 

64from sqlalchemy.exc import IntegrityError 

65 

66from ._butlerConfig import ButlerConfig 

67from ._butlerRepoIndex import ButlerRepoIndex 

68from ._deferredDatasetHandle import DeferredDatasetHandle 

69from ._limited_butler import LimitedButler 

70from .core import ( 

71 Config, 

72 ConfigSubset, 

73 DataCoordinate, 

74 DataId, 

75 DataIdValue, 

76 DatasetIdGenEnum, 

77 DatasetRef, 

78 DatasetRefURIs, 

79 DatasetType, 

80 Datastore, 

81 Dimension, 

82 DimensionConfig, 

83 DimensionElement, 

84 DimensionRecord, 

85 DimensionUniverse, 

86 FileDataset, 

87 Progress, 

88 StorageClass, 

89 StorageClassFactory, 

90 Timespan, 

91 ValidationError, 

92) 

93from .core.repoRelocation import BUTLER_ROOT_TAG 

94from .core.utils import transactional 

95from .registry import ( 

96 CollectionType, 

97 ConflictingDefinitionError, 

98 DataIdError, 

99 MissingDatasetTypeError, 

100 Registry, 

101 RegistryConfig, 

102 RegistryDefaults, 

103) 

104from .transfers import RepoExportContext 

105 

106if TYPE_CHECKING: 

107 from lsst.resources import ResourceHandleProtocol 

108 

109 from .transfers import RepoImportBackend 

110 

111log = getLogger(__name__) 

112 

113 

114class ButlerValidationError(ValidationError): 

115 """There is a problem with the Butler configuration.""" 

116 

117 pass 

118 

119 

120class Butler(LimitedButler): 

121 """Main entry point for the data access system. 

122 

123 Parameters 

124 ---------- 

125 config : `ButlerConfig`, `Config` or `str`, optional

126 Configuration. Anything acceptable to the 

127 `ButlerConfig` constructor. If a directory path 

128 is given the configuration will be read from a ``butler.yaml`` file in 

129 that location. If `None` is given default values will be used. 

130 butler : `Butler`, optional

131 If provided, construct a new Butler that uses the same registry and 

132 datastore as the given one, but with the given collection and run. 

133 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

134 arguments. 

135 collections : `str` or `Iterable` [ `str` ], optional 

136 An expression specifying the collections to be searched (in order) when 

137 reading datasets. 

138 This may be a `str` collection name or an iterable thereof. 

139 See :ref:`daf_butler_collection_expressions` for more information. 

140 These collections are not registered automatically and must be 

141 manually registered before they are used by any method, but they may be 

142 manually registered after the `Butler` is initialized. 

143 run : `str`, optional 

144 Name of the `~CollectionType.RUN` collection new datasets should be 

145 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

146 ``collections`` will be set to ``[run]``. If not `None`, this 

147 collection will automatically be registered. If this is not set (and 

148 ``writeable`` is not set either), a read-only butler will be created. 

149 searchPaths : `list` of `str`, optional 

150 Directory paths to search when calculating the full Butler 

151 configuration. Not used if the supplied config is already a 

152 `ButlerConfig`. 

153 writeable : `bool`, optional 

154 Explicitly sets whether the butler supports write operations. If not 

155 provided, a read-write butler is created if ``run`` is not `None` and a

156 read-only butler otherwise.

157 inferDefaults : `bool`, optional 

158 If `True` (default) infer default data ID values from the values 

159 present in the datasets in ``collections``: if all collections have the 

160 same value (or no value) for a governor dimension, that value will be 

161 the default for that dimension. Nonexistent collections are ignored. 

162 If a default value is provided explicitly for a governor dimension via 

163 ``**kwargs``, no default will be inferred for that dimension. 

164 **kwargs : `str` 

165 Default data ID key-value pairs. These may only identify "governor" 

166 dimensions like ``instrument`` and ``skymap``. 

167 

168 Examples 

169 -------- 

170 While there are many ways to control exactly how a `Butler` interacts with 

171 the collections in its `Registry`, the most common cases are still simple. 

172 

173 For a read-only `Butler` that searches one collection, do:: 

174 

175 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

176 

177 For a read-write `Butler` that writes to and reads from a 

178 `~CollectionType.RUN` collection:: 

179 

180 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

181 

182 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

183 because we want to write to one `~CollectionType.RUN` collection but read 

184 from several others (as well):: 

185 

186 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

187 collections=["u/alice/DM-50000/a", 

188 "u/bob/DM-49998", 

189 "HSC/defaults"]) 

190 

191 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

192 Datasets will be read first from that run (since it appears first in the 

193 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

194 

195 Finally, one can always create a `Butler` with no collections:: 

196 

197 butler = Butler("/path/to/repo", writeable=True) 

198 

199 This can be extremely useful when you just want to use ``butler.registry``, 

200 e.g. for inserting dimension data or managing collections, or when the 

201 collections you want to use with the butler are not consistent. 

202 Passing ``writeable`` explicitly here is only necessary if you want to be 

203 able to make changes to the repo - usually the value for ``writeable`` can 

204 be guessed from the collection arguments provided, but it defaults to 

205 `False` when there are no collection arguments. 

206 """ 

207 

208 def __init__( 

209 self, 

210 config: Union[Config, ResourcePathExpression, None] = None, 

211 *, 

212 butler: Optional[Butler] = None, 

213 collections: Any = None, 

214 run: Optional[str] = None, 

215 searchPaths: Optional[Sequence[ResourcePathExpression]] = None, 

216 writeable: Optional[bool] = None, 

217 inferDefaults: bool = True, 

218 **kwargs: str, 

219 ): 

220 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

221 # Load registry, datastore, etc. from config or existing butler. 

222 if butler is not None: 

223 if config is not None or searchPaths is not None or writeable is not None: 

224 raise TypeError( 

225 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

226 ) 

227 self.registry = butler.registry.copy(defaults) 

228 self.datastore = butler.datastore 

229 self.storageClasses = butler.storageClasses 

230 self._config: ButlerConfig = butler._config 

231 else: 

232 # Can only look for strings in the known repos list. 

233 if isinstance(config, str) and config in self.get_known_repos(): 

234 config = str(self.get_repo_uri(config)) 

235 try: 

236 self._config = ButlerConfig(config, searchPaths=searchPaths) 

237 except FileNotFoundError as e: 

238 if known := self.get_known_repos(): 

239 aliases = f"(known aliases: {', '.join(known)})" 

240 else: 

241 aliases = "(no known aliases)" 

242 raise FileNotFoundError(f"{e} {aliases}") from e 

243 try: 

244 if "root" in self._config: 

245 butlerRoot = self._config["root"] 

246 else: 

247 butlerRoot = self._config.configDir 

248 if writeable is None: 

249 writeable = run is not None 

250 self.registry = Registry.fromConfig( 

251 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

252 ) 

253 self.datastore = Datastore.fromConfig( 

254 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

255 ) 

256 self.storageClasses = StorageClassFactory() 

257 self.storageClasses.addFromConfig(self._config) 

258 except Exception: 

259 # Failures here usually mean that configuration is incomplete, 

260 # just issue an error message which includes config file URI. 

261 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

262 raise 

263 

264 # For execution butler the datastore needs a special 

265 # dependency-inversion trick. This is not used by regular butler, 

266 # but we do not have a way to distinguish regular butler from execution 

267 # butler. 

268 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

269 

270 if "run" in self._config or "collection" in self._config: 

271 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

272 

273 GENERATION: ClassVar[int] = 3 

274 """This is a Generation 3 Butler. 

275 

276 This attribute may be removed in the future, once the Generation 2 Butler 

277 interface has been fully retired; it should only be used in transitional 

278 code. 

279 """ 

280 

281 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

282 """Return DatasetType defined in registry given dataset type name.""" 

283 try: 

284 return self.registry.getDatasetType(name) 

285 except MissingDatasetTypeError: 

286 return None 

287 

288 @classmethod 

289 def get_repo_uri(cls, label: str) -> ResourcePath: 

290 """Look up the label in a butler repository index. 

291 

292 Parameters 

293 ---------- 

294 label : `str` 

295 Label of the Butler repository to look up. 

296 

297 Returns 

298 ------- 

299 uri : `lsst.resources.ResourcePath` 

300 URI to the Butler repository associated with the given label. 

301 

302 Raises 

303 ------ 

304 KeyError 

305 Raised if the label is not found in the index, or if an index 

306 can not be found at all. 

307 

308 Notes 

309 ----- 

310 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

311 information is discovered. 

312 """ 

313 return ButlerRepoIndex.get_repo_uri(label) 

314 

315 @classmethod 

316 def get_known_repos(cls) -> Set[str]: 

317 """Retrieve the list of known repository labels. 

318 

319 Returns 

320 ------- 

321 repos : `set` of `str` 

322 All the known labels. Can be empty if no index can be found. 

323 

324 Notes 

325 ----- 

326 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

327 information is discovered. 

328 """ 

329 return ButlerRepoIndex.get_known_repos() 
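# A minimal usage sketch for the two repository-index helpers above; the
# index label "main" and the fallback path are hypothetical, not taken from
# this module:
#
#     from lsst.daf.butler import Butler
#
#     if "main" in Butler.get_known_repos():
#         print(Butler.get_repo_uri("main"))  # ResourcePath for the label
#         butler = Butler("main")             # aliases are resolved in __init__
#     else:
#         butler = Butler("/path/to/repo")    # fall back to an explicit path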

330 

331 @staticmethod 

332 def makeRepo( 

333 root: ResourcePathExpression, 

334 config: Union[Config, str, None] = None, 

335 dimensionConfig: Union[Config, str, None] = None, 

336 standalone: bool = False, 

337 searchPaths: Optional[List[str]] = None, 

338 forceConfigRoot: bool = True, 

339 outfile: Optional[ResourcePathExpression] = None, 

340 overwrite: bool = False, 

341 ) -> Config: 

342 """Create an empty data repository by adding a butler.yaml config 

343 to a repository root directory. 

344 

345 Parameters 

346 ---------- 

347 root : `lsst.resources.ResourcePathExpression` 

348 Path or URI to the root location of the new repository. Will be 

349 created if it does not exist. 

350 config : `Config` or `str`, optional 

351 Configuration to write to the repository, after setting any 

352 root-dependent Registry or Datastore config options. Can not 

353 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

354 configuration will be used. Root-dependent config options 

355 specified in this config are overwritten if ``forceConfigRoot`` 

356 is `True`. 

357 dimensionConfig : `Config` or `str`, optional 

358 Configuration for dimensions, will be used to initialize registry 

359 database. 

360 standalone : `bool` 

361 If `True`, write all expanded defaults, not just customized or 

362 repository-specific settings. 

363 This (mostly) decouples the repository from the default 

364 configuration, insulating it from changes to the defaults (which 

365 may be good or bad, depending on the nature of the changes). 

366 Future *additions* to the defaults will still be picked up when 

367 initializing a `Butler` for repos created with ``standalone=True``. 

368 searchPaths : `list` of `str`, optional 

369 Directory paths to search when calculating the full butler 

370 configuration. 

371 forceConfigRoot : `bool`, optional 

372 If `False`, any values present in the supplied ``config`` that 

373 would normally be reset are not overridden and will appear 

374 directly in the output config. This allows non-standard overrides 

375 of the root directory for a datastore or registry to be given. 

376 If this parameter is `True` the values for ``root`` will be 

377 forced into the resulting config if appropriate. 

378 outfile : `lsst.resources.ResourcePathExpression`, optional 

379 If not-`None`, the output configuration will be written to this 

380 location rather than into the repository itself. Can be a URI 

381 string. Can refer to a directory that will be used to write 

382 ``butler.yaml``. 

383 overwrite : `bool`, optional 

384 Create a new configuration file even if one already exists 

385 in the specified output location. Default is to raise 

386 an exception. 

387 

388 Returns 

389 ------- 

390 config : `Config` 

391 The updated `Config` instance written to the repo. 

392 

393 Raises 

394 ------ 

395 ValueError 

396 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

397 regular Config (as these subclasses would make it impossible to 

398 support ``standalone=False``). 

399 FileExistsError 

400 Raised if the output config file already exists. 

401 os.error 

402 Raised if the directory does not exist, exists but is not a 

403 directory, or cannot be created. 

404 

405 Notes 

406 ----- 

407 Note that when ``standalone=False`` (the default), the configuration 

408 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

409 construct the repository should also be used to construct any Butlers 

410 to avoid configuration inconsistencies. 

411 """ 

412 if isinstance(config, (ButlerConfig, ConfigSubset)): 

413 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

414 

415 # Ensure that the root of the repository exists or can be made 

416 root_uri = ResourcePath(root, forceDirectory=True) 

417 root_uri.mkdir() 

418 

419 config = Config(config) 

420 

421 # If we are creating a new repo from scratch with relative roots, 

422 # do not propagate an explicit root from the config file 

423 if "root" in config: 

424 del config["root"] 

425 

426 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

427 imported_class = doImportType(full["datastore", "cls"]) 

428 if not issubclass(imported_class, Datastore): 

429 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

430 datastoreClass: Type[Datastore] = imported_class 

431 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

432 

433 # if key exists in given config, parse it, otherwise parse the defaults 

434 # in the expanded config 

435 if config.get(("registry", "db")): 

436 registryConfig = RegistryConfig(config) 

437 else: 

438 registryConfig = RegistryConfig(full) 

439 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

440 if defaultDatabaseUri is not None: 

441 Config.updateParameters( 

442 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

443 ) 

444 else: 

445 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

446 

447 if standalone: 

448 config.merge(full) 

449 else: 

450 # Always expand the registry.managers section into the per-repo 

451 # config, because after the database schema is created, it's not 

452 # allowed to change anymore. Note that in the standalone=True 

453 # branch, _everything_ in the config is expanded, so there's no 

454 # need to special case this. 

455 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

456 configURI: ResourcePathExpression 

457 if outfile is not None: 

458 # When writing to a separate location we must include 

459 # the root of the butler repo in the config else it won't know 

460 # where to look. 

461 config["root"] = root_uri.geturl() 

462 configURI = outfile 

463 else: 

464 configURI = root_uri 

465 # Strip obscore configuration, if it is present, before writing the 

466 # config to a file; the obscore config will be stored in the registry. 

467 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

468 config_to_write = config.copy() 

469 del config_to_write[obscore_config_key] 

470 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

471 # configFile attribute is updated, need to copy it to original. 

472 config.configFile = config_to_write.configFile 

473 else: 

474 config.dumpToUri(configURI, overwrite=overwrite) 

475 

476 # Create Registry and populate tables 

477 registryConfig = RegistryConfig(config.get("registry")) 

478 dimensionConfig = DimensionConfig(dimensionConfig) 

479 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

480 

481 log.verbose("Wrote new Butler configuration file to %s", configURI) 

482 

483 return config 
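# A short sketch of creating and then opening a new repository with
# ``makeRepo``; the root path and run name below are hypothetical:
#
#     from lsst.daf.butler import Butler
#
#     # Write butler.yaml and create the registry tables at the given root.
#     Butler.makeRepo("/tmp/demo_repo")
#
#     # Open the new repository with a RUN collection so it is writeable.
#     butler = Butler("/tmp/demo_repo", run="u/alice/demo")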

484 

485 @classmethod 

486 def _unpickle( 

487 cls, 

488 config: ButlerConfig, 

489 collections: Optional[tuple[str, ...]], 

490 run: Optional[str], 

491 defaultDataId: Dict[str, str], 

492 writeable: bool, 

493 ) -> Butler: 

494 """Callable used to unpickle a Butler. 

495 

496 We prefer not to use ``Butler.__init__`` directly so we can force some 

497 of its many arguments to be keyword-only (note that ``__reduce__`` 

498 can only invoke callables with positional arguments). 

499 

500 Parameters 

501 ---------- 

502 config : `ButlerConfig` 

503 Butler configuration, already coerced into a true `ButlerConfig` 

504 instance (and hence after any search paths for overrides have been 

505 utilized). 

506 collections : `tuple` [ `str` ] 

507 Names of the default collections to read from. 

508 run : `str`, optional 

509 Name of the default `~CollectionType.RUN` collection to write to. 

510 defaultDataId : `dict` [ `str`, `str` ] 

511 Default data ID values. 

512 writeable : `bool` 

513 Whether the Butler should support write operations. 

514 

515 Returns 

516 ------- 

517 butler : `Butler` 

518 A new `Butler` instance. 

519 """ 

520 # MyPy doesn't recognize that the kwargs below are totally valid; it 

521 # seems to think ``**defaultDataId`` is a _positional_ argument! 

522 return cls( 

523 config=config, 

524 collections=collections, 

525 run=run, 

526 writeable=writeable, 

527 **defaultDataId, # type: ignore 

528 ) 

529 

530 def __reduce__(self) -> tuple: 

531 """Support pickling.""" 

532 return ( 

533 Butler._unpickle, 

534 ( 

535 self._config, 

536 self.collections, 

537 self.run, 

538 self.registry.defaults.dataId.byName(), 

539 self.registry.isWriteable(), 

540 ), 

541 ) 

542 

543 def __str__(self) -> str: 

544 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

545 self.collections, self.run, self.datastore, self.registry 

546 ) 

547 

548 def isWriteable(self) -> bool: 

549 """Return `True` if this `Butler` supports write operations.""" 

550 return self.registry.isWriteable() 

551 

552 @contextlib.contextmanager 

553 def transaction(self) -> Iterator[None]: 

554 """Context manager supporting `Butler` transactions. 

555 

556 Transactions can be nested. 

557 """ 

558 with self.registry.transaction(): 

559 with self.datastore.transaction(): 

560 yield 
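# A sketch of how the transaction context manager is typically used, assuming
# a butler constructed with a run; the dataset type name and data IDs are
# hypothetical:
#
#     with butler.transaction():
#         butler.put(obj1, "someDatasetType", {"instrument": "HSC", "visit": 1})
#         # Transactions can be nested; registry and datastore operations are
#         # rolled back together if an exception escapes the outer block.
#         with butler.transaction():
#             butler.put(obj2, "someDatasetType", {"instrument": "HSC", "visit": 2})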

561 

562 def _standardizeArgs( 

563 self, 

564 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

565 dataId: Optional[DataId] = None, 

566 for_put: bool = True, 

567 **kwargs: Any, 

568 ) -> Tuple[DatasetType, Optional[DataId]]: 

569 """Standardize the arguments passed to several Butler APIs. 

570 

571 Parameters 

572 ---------- 

573 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

574 When `DatasetRef` the `dataId` should be `None`. 

575 Otherwise the `DatasetType` or name thereof. 

576 dataId : `dict` or `DataCoordinate` 

577 A `dict` of `Dimension` link name, value pairs that label the 

578 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

579 should be provided as the second argument. 

580 for_put : `bool`, optional 

581 If `True` this call is invoked as part of a `Butler.put()`. 

582 Otherwise it is assumed to be part of a `Butler.get()`. This 

583 parameter is only relevant if there is dataset type 

584 inconsistency. 

585 **kwargs 

586 Additional keyword arguments used to augment or construct a 

587 `DataCoordinate`. See `DataCoordinate.standardize` 

588 parameters. 

589 

590 Returns 

591 ------- 

592 datasetType : `DatasetType` 

593 A `DatasetType` instance extracted from ``datasetRefOrType``. 

594 dataId : `dict` or `DataId`, optional 

595 Argument that can be used (along with ``kwargs``) to construct a 

596 `DataId`. 

597 

598 Notes 

599 ----- 

600 Butler APIs that conceptually need a DatasetRef also allow passing a 

601 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

602 keyword arguments that can be used to construct one) separately. This 

603 method accepts those arguments and always returns a true `DatasetType` 

604 and a `DataId` or `dict`. 

605 

606 Standardization of `dict` vs `DataId` is best handled by passing the 

607 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

608 generally similarly flexible. 

609 """ 

610 externalDatasetType: Optional[DatasetType] = None 

611 internalDatasetType: Optional[DatasetType] = None 

612 if isinstance(datasetRefOrType, DatasetRef): 

613 if dataId is not None or kwargs: 

614 raise ValueError("DatasetRef given, cannot use dataId as well") 

615 externalDatasetType = datasetRefOrType.datasetType 

616 dataId = datasetRefOrType.dataId 

617 else: 

618 # Don't check whether DataId is provided, because Registry APIs 

619 # can usually construct a better error message when it wasn't. 

620 if isinstance(datasetRefOrType, DatasetType): 

621 externalDatasetType = datasetRefOrType 

622 else: 

623 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

624 

625 # Check that they are self-consistent 

626 if externalDatasetType is not None: 

627 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

628 if externalDatasetType != internalDatasetType: 

629 # We can allow differences if they are compatible, depending 

630 # on whether this is a get or a put. A get requires that 

631 # the python type associated with the datastore can be 

632 # converted to the user type. A put requires that the user 

633 # supplied python type can be converted to the internal 

634 # type expected by registry. 

635 relevantDatasetType = internalDatasetType 

636 if for_put: 

637 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

638 else: 

639 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

640 relevantDatasetType = externalDatasetType 

641 if not is_compatible: 

642 raise ValueError( 

643 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

644 f"registry definition ({internalDatasetType})" 

645 ) 

646 # Override the internal definition. 

647 internalDatasetType = relevantDatasetType 

648 

649 assert internalDatasetType is not None 

650 return internalDatasetType, dataId 
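# The flexibility described in the Notes above means the public APIs accept
# either a resolved reference or a dataset type plus a data ID. A sketch with
# hypothetical dataset type and data ID values:
#
#     obj = butler.get(ref)                                    # DatasetRef only
#     obj = butler.get("calexp", {"instrument": "HSC", "visit": 903334,
#                                 "detector": 42})             # type + data ID
#     obj = butler.get("calexp", instrument="HSC", visit=903334, detector=42)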

651 

652 def _rewrite_data_id( 

653 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

654 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

655 """Rewrite a data ID taking into account dimension records. 

656 

657 Take a Data ID and keyword args and rewrite it if necessary to 

658 allow the user to specify dimension records rather than dimension 

659 primary values. 

660 

661 This allows a user to include a dataId dict with keys of 

662 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

663 the integer exposure ID. It also allows a string to be given 

664 for a dimension value rather than the integer ID if that is more 

665 convenient. For example, rather than having to specify the 

666 detector with ``detector.full_name``, a string given for ``detector`` 

667 will be interpreted as the full name and converted to the integer 

668 value. 

669 

670 Keyword arguments can also use strings for dimensions like detector 

671 and exposure but python does not allow them to include ``.`` and 

672 so the ``exposure.day_obs`` syntax can not be used in a keyword 

673 argument. 

674 

675 Parameters 

676 ---------- 

677 dataId : `dict` or `DataCoordinate` 

678 A `dict` of `Dimension` link name, value pairs that will label the 

679 `DatasetRef` within a Collection. 

680 datasetType : `DatasetType` 

681 The dataset type associated with this dataId. Required to 

682 determine the relevant dimensions. 

683 **kwargs 

684 Additional keyword arguments used to augment or construct a 

685 `DataId`. See `DataId` parameters. 

686 

687 Returns 

688 ------- 

689 dataId : `dict` or `DataCoordinate` 

690 The dataId, possibly rewritten. If given a `DataCoordinate` and 

691 no keyword arguments, the original dataId will be returned 

692 unchanged. 

693 **kwargs : `dict` 

694 Any unused keyword arguments (would normally be empty dict). 

695 """ 

696 # Do nothing if we have a standalone DataCoordinate. 

697 if isinstance(dataId, DataCoordinate) and not kwargs: 

698 return dataId, kwargs 

699 

700 # Process dimension records that are using record information 

701 # rather than ids 

702 newDataId: Dict[str, DataIdValue] = {} 

703 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

704 

705 # If all of the dataId comes from keyword parameters we do not need 

706 # to do anything here, because keys of the form exposure.obs_id are 

707 # impossible: a "." is not allowed in a keyword parameter. 

708 if dataId: 

709 for k, v in dataId.items(): 

710 # If we have a Dimension we do not need to do anything 

711 # because it cannot be a compound key. 

712 if isinstance(k, str) and "." in k: 

713 # Someone is using a more human-readable dataId 

714 dimensionName, record = k.split(".", 1) 

715 byRecord[dimensionName][record] = v 

716 elif isinstance(k, Dimension): 

717 newDataId[k.name] = v 

718 else: 

719 newDataId[k] = v 

720 

721 # Go through the updated dataId and check the type in case someone is 

722 # using an alternate key. We have already filtered out the compound 

723 # dimension.record style keys. 

724 not_dimensions = {} 

725 

726 # Will need to look in the dataId and the keyword arguments 

727 # and will remove them if they need to be fixed or are unrecognized. 

728 for dataIdDict in (newDataId, kwargs): 

729 # Use a list so we can adjust the dict safely in the loop 

730 for dimensionName in list(dataIdDict): 

731 value = dataIdDict[dimensionName] 

732 try: 

733 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

734 except KeyError: 

735 # This is not a real dimension 

736 not_dimensions[dimensionName] = value 

737 del dataIdDict[dimensionName] 

738 continue 

739 

740 # Convert an integral type to an explicit int to simplify 

741 # comparisons here 

742 if isinstance(value, numbers.Integral): 

743 value = int(value) 

744 

745 if not isinstance(value, dimension.primaryKey.getPythonType()): 

746 for alternate in dimension.alternateKeys: 

747 if isinstance(value, alternate.getPythonType()): 

748 byRecord[dimensionName][alternate.name] = value 

749 del dataIdDict[dimensionName] 

750 log.debug( 

751 "Converting dimension %s to %s.%s=%s", 

752 dimensionName, 

753 dimensionName, 

754 alternate.name, 

755 value, 

756 ) 

757 break 

758 else: 

759 log.warning( 

760 "Type mismatch found for value '%r' provided for dimension %s. " 

761 "Could not find matching alternative (primary key has type %s) " 

762 "so attempting to use as-is.", 

763 value, 

764 dimensionName, 

765 dimension.primaryKey.getPythonType(), 

766 ) 

767 

768 # By this point kwargs and newDataId should only include valid 

769 # dimensions. Merge kwargs in to the new dataId and log if there 

770 # are dimensions in both (rather than calling update). 

771 for k, v in kwargs.items(): 

772 if k in newDataId and newDataId[k] != v: 

773 log.debug( 

774 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

775 ) 

776 newDataId[k] = v 

777 # No need to retain any values in kwargs now. 

778 kwargs = {} 

779 

780 # If we have some unrecognized dimensions we have to try to connect 

781 # them to records in other dimensions. This is made more complicated 

782 # by some dimensions having records with clashing names. A mitigation 

783 # is that we can tell by this point which dimensions are missing 

784 # for the DatasetType but this does not work for calibrations 

785 # where additional dimensions can be used to constrain the temporal 

786 # axis. 

787 if not_dimensions: 

788 # Search for all dimensions even if we have been given a value 

789 # explicitly. In some cases records are given as well as the 

790 # actual dimension, and this should not be an error if they 

791 # match. 

792 mandatoryDimensions = datasetType.dimensions.names # - provided 

793 

794 candidateDimensions: Set[str] = set() 

795 candidateDimensions.update(mandatoryDimensions) 

796 

797 # For calibrations we may well be needing temporal dimensions 

798 # so rather than always including all dimensions in the scan 

799 # restrict things a little. It is still possible for there 

800 # to be confusion over day_obs in visit vs exposure for example. 

801 # If we are not searching calibration collections things may 

802 # fail but they are going to fail anyway because of the 

803 # ambiguity of the dataId... 

804 if datasetType.isCalibration(): 

805 for dim in self.registry.dimensions.getStaticDimensions(): 

806 if dim.temporal: 

807 candidateDimensions.add(str(dim)) 

808 

809 # Look up table for the first association with a dimension 

810 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

811 

812 # Keep track of whether an item is associated with multiple 

813 # dimensions. 

814 counter: Counter[str] = Counter() 

815 assigned: Dict[str, Set[str]] = defaultdict(set) 

816 

817 # Go through the missing dimensions and associate the 

818 # given names with records within those dimensions 

819 matched_dims = set() 

820 for dimensionName in candidateDimensions: 

821 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

822 fields = dimension.metadata.names | dimension.uniqueKeys.names 

823 for field in not_dimensions: 

824 if field in fields: 

825 guessedAssociation[dimensionName][field] = not_dimensions[field] 

826 counter[dimensionName] += 1 

827 assigned[field].add(dimensionName) 

828 matched_dims.add(field) 

829 

830 # Calculate the fields that matched nothing. 

831 never_found = set(not_dimensions) - matched_dims 

832 

833 if never_found: 

834 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

835 

836 # There is a chance we have allocated a single dataId item 

837 # to multiple dimensions. Need to decide which should be retained. 

838 # For now assume that the most popular alternative wins. 

839 # This means that day_obs with seq_num will result in 

840 # exposure.day_obs and not visit.day_obs 

841 # Also prefer an explicitly missing dimension over an inferred 

842 # temporal dimension. 

843 for fieldName, assignedDimensions in assigned.items(): 

844 if len(assignedDimensions) > 1: 

845 # Pick the most popular (preferring mandatory dimensions) 

846 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

847 if requiredButMissing: 

848 candidateDimensions = requiredButMissing 

849 else: 

850 candidateDimensions = assignedDimensions 

851 

852 # If this is a choice between visit and exposure and 

853 # neither was a required part of the dataset type, 

854 # (hence in this branch) always prefer exposure over 

855 # visit since exposures are always defined and visits 

856 # are defined from exposures. 

857 if candidateDimensions == {"exposure", "visit"}: 

858 candidateDimensions = {"exposure"} 

859 

860 # Select the relevant items and get a new restricted 

861 # counter. 

862 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

863 duplicatesCounter: Counter[str] = Counter() 

864 duplicatesCounter.update(theseCounts) 

865 

866 # Choose the most common. If they are equally common 

867 # we will pick the one that was found first. 

868 # Returns a list of tuples 

869 selected = duplicatesCounter.most_common(1)[0][0] 

870 

871 log.debug( 

872 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

873 " Removed ambiguity by choosing dimension %s.", 

874 fieldName, 

875 ", ".join(assignedDimensions), 

876 selected, 

877 ) 

878 

879 for candidateDimension in assignedDimensions: 

880 if candidateDimension != selected: 

881 del guessedAssociation[candidateDimension][fieldName] 

882 

883 # Update the record look up dict with the new associations 

884 for dimensionName, values in guessedAssociation.items(): 

885 if values: # A dict might now be empty 

886 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

887 byRecord[dimensionName].update(values) 

888 

889 if byRecord: 

890 # Some record specifiers were found so we need to convert 

891 # them to the Id form 

892 for dimensionName, values in byRecord.items(): 

893 if dimensionName in newDataId: 

894 log.debug( 

895 "DataId specified explicit %s dimension value of %s in addition to" 

896 " general record specifiers for it of %s. Ignoring record information.", 

897 dimensionName, 

898 newDataId[dimensionName], 

899 str(values), 

900 ) 

901 # Get the actual record and compare with these values. 

902 try: 

903 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

904 except DataIdError: 

905 raise ValueError( 

906 f"Could not find dimension '{dimensionName}'" 

907 f" with dataId {newDataId} as part of comparing with" 

908 f" record values {byRecord[dimensionName]}" 

909 ) from None 

910 if len(recs) == 1: 

911 errmsg: List[str] = [] 

912 for k, v in values.items(): 

913 if (recval := getattr(recs[0], k)) != v: 

914 errmsg.append(f"{k}({recval} != {v})") 

915 if errmsg: 

916 raise ValueError( 

917 f"Dimension {dimensionName} in dataId has explicit value" 

918 " inconsistent with records: " + ", ".join(errmsg) 

919 ) 

920 else: 

921 # Multiple matches for an explicit dimension 

922 # should never happen but let downstream complain. 

923 pass 

924 continue 

925 

926 # Build up a WHERE expression 

927 bind = {k: v for k, v in values.items()} 

928 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

929 

930 # Hopefully we get a single record that matches 

931 records = set( 

932 self.registry.queryDimensionRecords( 

933 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

934 ) 

935 ) 

936 

937 if len(records) != 1: 

938 if len(records) > 1: 

939 # visit can have an ambiguous answer without involving 

940 # visit_system. The default visit_system is defined 

941 # by the instrument. 

942 if ( 

943 dimensionName == "visit" 

944 and "visit_system_membership" in self.registry.dimensions 

945 and "visit_system" in self.registry.dimensions["instrument"].metadata 

946 ): 

947 instrument_records = list( 

948 self.registry.queryDimensionRecords( 

949 "instrument", 

950 dataId=newDataId, 

951 **kwargs, 

952 ) 

953 ) 

954 if len(instrument_records) == 1: 

955 visit_system = instrument_records[0].visit_system 

956 if visit_system is None: 

957 # Set to a value that will never match. 

958 visit_system = -1 

959 

960 # Look up each visit in the 

961 # visit_system_membership records. 

962 for rec in records: 

963 membership = list( 

964 self.registry.queryDimensionRecords( 

965 # Use bind to allow zero results. 

966 # This is a fully-specified query. 

967 "visit_system_membership", 

968 where="instrument = inst AND visit_system = system AND visit = v", 

969 bind=dict( 

970 inst=instrument_records[0].name, system=visit_system, v=rec.id 

971 ), 

972 ) 

973 ) 

974 if membership: 

975 # This record is the right answer. 

976 records = {rec} 

977 break 

978 

979 # The ambiguity may have been resolved so check again. 

980 if len(records) > 1: 

981 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

982 for r in records: 

983 log.debug("- %s", str(r)) 

984 raise ValueError( 

985 f"DataId specification for dimension {dimensionName} is not" 

986 f" uniquely constrained to a single dataset by {values}." 

987 f" Got {len(records)} results." 

988 ) 

989 else: 

990 raise ValueError( 

991 f"DataId specification for dimension {dimensionName} matched no" 

992 f" records when constrained by {values}" 

993 ) 

994 

995 # Get the primary key from the real dimension object 

996 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

997 if not isinstance(dimension, Dimension): 

998 raise RuntimeError( 

999 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1000 ) 

1001 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1002 

1003 return newDataId, kwargs 
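# A sketch of the data ID rewriting described above, using hypothetical
# instrument, detector and exposure values:
#
#     # Compound record keys and alternate string keys are both accepted and
#     # converted to primary key values before the registry lookup.
#     raw = butler.get(
#         "raw",
#         {"exposure.day_obs": 20220501, "exposure.seq_num": 42},
#         instrument="HSC",
#         detector="1_53",   # full_name string resolved via the alternate key
#     )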

1004 

1005 def _findDatasetRef( 

1006 self, 

1007 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1008 dataId: Optional[DataId] = None, 

1009 *, 

1010 collections: Any = None, 

1011 predict: bool = False, 

1012 run: str | None = None, 

1013 **kwargs: Any, 

1014 ) -> DatasetRef: 

1015 """Shared logic for methods that start with a search for a dataset in 

1016 the registry. 

1017 

1018 Parameters 

1019 ---------- 

1020 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1021 When `DatasetRef` the `dataId` should be `None`. 

1022 Otherwise the `DatasetType` or name thereof. 

1023 dataId : `dict` or `DataCoordinate`, optional 

1024 A `dict` of `Dimension` link name, value pairs that label the 

1025 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1026 should be provided as the first argument. 

1027 collections : Any, optional 

1028 Collections to be searched, overriding ``self.collections``. 

1029 Can be any of the types supported by the ``collections`` argument 

1030 to butler construction. 

1031 predict : `bool`, optional 

1032 If `True`, return a newly created `DatasetRef` with a unique 

1033 dataset ID if finding a reference in the `Registry` fails. 

1034 Defaults to `False`. 

1035 run : `str`, optional 

1036 Run collection name to use for creating `DatasetRef` for predicted 

1037 datasets. Only used if ``predict`` is `True`. 

1038 **kwargs 

1039 Additional keyword arguments used to augment or construct a 

1040 `DataId`. See `DataId` parameters. 

1041 

1042 Returns 

1043 ------- 

1044 ref : `DatasetRef` 

1045 A reference to the dataset identified by the given arguments. 

1046 This can be the same dataset reference as given if it was 

1047 resolved. 

1048 

1049 Raises 

1050 ------ 

1051 LookupError 

1052 Raised if no matching dataset exists in the `Registry` (and 

1053 ``predict`` is `False`). 

1054 ValueError 

1055 Raised if a resolved `DatasetRef` was passed as an input, but it 

1056 differs from the one found in the registry. 

1057 TypeError 

1058 Raised if no collections were provided. 

1059 """ 

1060 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1061 if isinstance(datasetRefOrType, DatasetRef): 

1062 if collections is not None: 

1063 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1064 return datasetRefOrType 

1065 timespan: Optional[Timespan] = None 

1066 

1067 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1068 

1069 if datasetType.isCalibration(): 

1070 # Because this is a calibration dataset, first try to 

1071 # standardize the data ID without restricting the dimensions to 

1072 # those of the dataset type requested, because there may be extra 

1073 # dimensions that provide temporal information for a validity-range 

1074 # lookup. 

1075 dataId = DataCoordinate.standardize( 

1076 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1077 ) 

1078 if dataId.graph.temporal: 

1079 dataId = self.registry.expandDataId(dataId) 

1080 timespan = dataId.timespan 

1081 else: 

1082 # Standardize the data ID to just the dimensions of the dataset 

1083 # type instead of letting registry.findDataset do it, so we get the 

1084 # result even if no dataset is found. 

1085 dataId = DataCoordinate.standardize( 

1086 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1087 ) 

1088 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1089 # present in the current collection. 

1090 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1091 if ref is None: 

1092 if predict: 

1093 if run is None: 

1094 run = self.run 

1095 if run is None: 

1096 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1097 return DatasetRef(datasetType, dataId, run=run) 

1098 else: 

1099 if collections is None: 

1100 collections = self.registry.defaults.collections 

1101 raise LookupError( 

1102 f"Dataset {datasetType.name} with data ID {dataId} " 

1103 f"could not be found in collections {collections}." 

1104 ) 

1105 if datasetType != ref.datasetType: 

1106 # If they differ it is because the user explicitly specified 

1107 # a compatible dataset type to this call rather than using the 

1108 # registry definition. The DatasetRef must therefore be recreated 

1109 # using the user definition such that the expected type is 

1110 # returned. 

1111 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1112 

1113 return ref 

1114 

1115 @transactional 

1116 @deprecated( 

1117 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1118 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1119 " were relying on the run parameter to determine the run." 

1120 " Will be removed after v27.0.", 

1121 version="v26.0", 

1122 category=FutureWarning, 

1123 ) 

1124 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1125 # Docstring inherited. 

1126 return self.put(obj, ref) 

1127 

1128 @transactional 

1129 def put( 

1130 self, 

1131 obj: Any, 

1132 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1133 /, 

1134 dataId: Optional[DataId] = None, 

1135 *, 

1136 run: Optional[str] = None, 

1137 **kwargs: Any, 

1138 ) -> DatasetRef: 

1139 """Store and register a dataset. 

1140 

1141 Parameters 

1142 ---------- 

1143 obj : `object` 

1144 The dataset. 

1145 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1146 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1147 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1148 `DatasetRef` is given the run and ID are used directly. 

1149 dataId : `dict` or `DataCoordinate` 

1150 A `dict` of `Dimension` link name, value pairs that label the 

1151 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1152 should be provided as the second argument. 

1153 run : `str`, optional 

1154 The name of the run the dataset should be added to, overriding 

1155 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1156 **kwargs 

1157 Additional keyword arguments used to augment or construct a 

1158 `DataCoordinate`. See `DataCoordinate.standardize` 

1159 parameters. Not used if a resolved `DatasetRef` is provided. 

1160 

1161 Returns 

1162 ------- 

1163 ref : `DatasetRef` 

1164 A reference to the stored dataset, updated with the correct id if 

1165 given. 

1166 

1167 Raises 

1168 ------ 

1169 TypeError 

1170 Raised if the butler is read-only or if no run has been provided. 

1171 """ 

1172 if isinstance(datasetRefOrType, DatasetRef): 

1173 # This is a direct put of predefined DatasetRef. 

1174 log.debug("Butler put direct: %s", datasetRefOrType) 

1175 if run is not None: 

1176 warnings.warn("Run collection is not used for DatasetRef") 

1177 # If registry already has a dataset with the same dataset ID, 

1178 # dataset type and DataId, then _importDatasets will do nothing and 

1179 # just return the original ref. We have to raise in this case; the 

1180 # datastore check below handles that. 

1181 self.registry._importDatasets([datasetRefOrType], expand=True) 

1182 # Before trying to write to the datastore check that it does not 

1183 # know this dataset. This is prone to races, of course. 

1184 if self.datastore.knows(datasetRefOrType): 

1185 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1186 # Try to write the dataset to the datastore; if it fails due to a race 

1187 # with another write, the content of stored data may be 

1188 # unpredictable. 

1189 try: 

1190 self.datastore.put(obj, datasetRefOrType) 

1191 except IntegrityError as e: 

1192 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") 

1193 return datasetRefOrType 

1194 

1195 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1196 if not self.isWriteable(): 

1197 raise TypeError("Butler is read-only.") 

1198 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1199 

1200 # Handle dimension records in dataId 

1201 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1202 

1203 # Add Registry Dataset entry. 

1204 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1205 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1206 self.datastore.put(obj, ref) 

1207 

1208 return ref 
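# A sketch of both put() forms documented above; the dataset type, data ID and
# run name are hypothetical:
#
#     # Unresolved form: dataset type name plus data ID, written to a run.
#     ref = butler.put(catalog, "sourceTable",
#                      {"instrument": "HSC", "visit": 903334},
#                      run="u/alice/run1")
#
#     # Resolved form: a fully resolved DatasetRef (e.g. produced elsewhere)
#     # carries its own run and dataset ID, so the ``run`` argument is unused.
#     butler.put(other_catalog, resolved_ref)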

1209 

1210 @deprecated( 

1211 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1212 " Please use Butler.get(). Will be removed after v27.0.", 

1213 version="v26.0", 

1214 category=FutureWarning, 

1215 ) 

1216 def getDirect( 

1217 self, 

1218 ref: DatasetRef, 

1219 *, 

1220 parameters: Optional[Dict[str, Any]] = None, 

1221 storageClass: Optional[Union[StorageClass, str]] = None, 

1222 ) -> Any: 

1223 """Retrieve a stored dataset. 

1224 

1225 Parameters 

1226 ---------- 

1227 ref : `DatasetRef` 

1228 Resolved reference to an already stored dataset. 

1229 parameters : `dict` 

1230 Additional StorageClass-defined options to control reading, 

1231 typically used to efficiently read only a subset of the dataset. 

1232 storageClass : `StorageClass` or `str`, optional 

1233 The storage class to be used to override the Python type 

1234 returned by this method. By default the returned type matches 

1235 the dataset type definition for this dataset. Specifying a 

1236 read `StorageClass` can force a different type to be returned. 

1237 This type must be compatible with the original type. 

1238 

1239 Returns 

1240 ------- 

1241 obj : `object` 

1242 The dataset. 

1243 """ 

1244 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1245 

1246 @deprecated( 

1247 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1248 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1249 version="v26.0", 

1250 category=FutureWarning, 

1251 ) 

1252 def getDirectDeferred( 

1253 self, 

1254 ref: DatasetRef, 

1255 *, 

1256 parameters: Union[dict, None] = None, 

1257 storageClass: str | StorageClass | None = None, 

1258 ) -> DeferredDatasetHandle: 

1259 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1260 from a resolved `DatasetRef`. 

1261 

1262 Parameters 

1263 ---------- 

1264 ref : `DatasetRef` 

1265 Resolved reference to an already stored dataset. 

1266 parameters : `dict` 

1267 Additional StorageClass-defined options to control reading, 

1268 typically used to efficiently read only a subset of the dataset. 

1269 storageClass : `StorageClass` or `str`, optional 

1270 The storage class to be used to override the Python type 

1271 returned by this method. By default the returned type matches 

1272 the dataset type definition for this dataset. Specifying a 

1273 read `StorageClass` can force a different type to be returned. 

1274 This type must be compatible with the original type. 

1275 

1276 Returns 

1277 ------- 

1278 obj : `DeferredDatasetHandle` 

1279 A handle which can be used to retrieve a dataset at a later time. 

1280 

1281 Raises 

1282 ------ 

1283 LookupError 

1284 Raised if no matching dataset exists in the `Registry`. 

1285 """ 

1286 # Check that the dataset actually exists. 

1287 if not self.datastore.exists(ref): 

1288 raise LookupError(f"Dataset reference {ref} does not exist.") 

1289 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1290 

1291 def getDeferred( 

1292 self, 

1293 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1294 /, 

1295 dataId: Optional[DataId] = None, 

1296 *, 

1297 parameters: Union[dict, None] = None, 

1298 collections: Any = None, 

1299 storageClass: str | StorageClass | None = None, 

1300 **kwargs: Any, 

1301 ) -> DeferredDatasetHandle: 

1302 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1303 after an immediate registry lookup. 

1304 

1305 Parameters 

1306 ---------- 

1307 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1308 When `DatasetRef` the `dataId` should be `None`. 

1309 Otherwise the `DatasetType` or name thereof. 

1310 dataId : `dict` or `DataCoordinate`, optional 

1311 A `dict` of `Dimension` link name, value pairs that label the 

1312 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1313 should be provided as the first argument. 

1314 parameters : `dict` 

1315 Additional StorageClass-defined options to control reading, 

1316 typically used to efficiently read only a subset of the dataset. 

1317 collections : Any, optional 

1318 Collections to be searched, overriding ``self.collections``. 

1319 Can be any of the types supported by the ``collections`` argument 

1320 to butler construction. 

1321 storageClass : `StorageClass` or `str`, optional 

1322 The storage class to be used to override the Python type 

1323 returned by this method. By default the returned type matches 

1324 the dataset type definition for this dataset. Specifying a 

1325 read `StorageClass` can force a different type to be returned. 

1326 This type must be compatible with the original type. 

1327 **kwargs 

1328 Additional keyword arguments used to augment or construct a 

1329 `DataId`. See `DataId` parameters. 

1330 

1331 Returns 

1332 ------- 

1333 obj : `DeferredDatasetHandle` 

1334 A handle which can be used to retrieve a dataset at a later time. 

1335 

1336 Raises 

1337 ------ 

1338 LookupError 

1339 Raised if no matching dataset exists in the `Registry`. 

1340 ValueError 

1341 Raised if a resolved `DatasetRef` was passed as an input, but it 

1342 differs from the one found in the registry. 

1343 TypeError 

1344 Raised if no collections were provided. 

1345 """ 

1346 if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType): 

1347 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1348 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1349 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 
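# A sketch of deferred retrieval; the dataset type and data ID are
# hypothetical, and the handle is assumed to expose the usual
# DeferredDatasetHandle.get() method:
#
#     handle = butler.getDeferred("deepCoadd", {"tract": 0, "patch": 4,
#                                               "band": "r", "skymap": "test"})
#     # The registry lookup has already happened; no I/O has occurred yet.
#     coadd = handle.get()  # reads the dataset from the datastore on demand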

1350 

1351 def get( 

1352 self, 

1353 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1354 /, 

1355 dataId: Optional[DataId] = None, 

1356 *, 

1357 parameters: Optional[Dict[str, Any]] = None, 

1358 collections: Any = None, 

1359 storageClass: Optional[Union[StorageClass, str]] = None, 

1360 **kwargs: Any, 

1361 ) -> Any: 

1362 """Retrieve a stored dataset. 

1363 

1364 Parameters 

1365 ---------- 

1366 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1367 When `DatasetRef` the `dataId` should be `None`. 

1368 Otherwise the `DatasetType` or name thereof. 

1369 If a resolved `DatasetRef`, the associated dataset 

1370 is returned directly without additional querying. 

1371 dataId : `dict` or `DataCoordinate` 

1372 A `dict` of `Dimension` link name, value pairs that label the 

1373 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1374 should be provided as the first argument. 

1375 parameters : `dict` 

1376 Additional StorageClass-defined options to control reading, 

1377 typically used to efficiently read only a subset of the dataset. 

1378 collections : Any, optional 

1379 Collections to be searched, overriding ``self.collections``. 

1380 Can be any of the types supported by the ``collections`` argument 

1381 to butler construction. 

1382 storageClass : `StorageClass` or `str`, optional 

1383 The storage class to be used to override the Python type 

1384 returned by this method. By default the returned type matches 

1385 the dataset type definition for this dataset. Specifying a 

1386 read `StorageClass` can force a different type to be returned. 

1387 This type must be compatible with the original type. 

1388 **kwargs 

1389 Additional keyword arguments used to augment or construct a 

1390 `DataCoordinate`. See `DataCoordinate.standardize` 

1391 parameters. 

1392 

1393 Returns 

1394 ------- 

1395 obj : `object` 

1396 The dataset. 

1397 

1398 Raises 

1399 ------ 

1400 LookupError 

1401 Raised if no matching dataset exists in the `Registry`. 

1402 TypeError 

1403 Raised if no collections were provided. 

1404 

1405 Notes 

1406 ----- 

1407 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1408 this method requires that the given data ID include temporal dimensions 

1409 beyond the dimensions of the dataset type itself, in order to find the 

1410 dataset with the appropriate validity range. For example, a "bias" 

1411 dataset with native dimensions ``{instrument, detector}`` could be 

1412 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1413 ``exposure`` is a temporal dimension. 
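
Examples
--------
A minimal usage sketch (the repository path, collection name, dataset
type, and data ID values are illustrative assumptions)::

    butler = Butler("/path/to/repo", collections="HSC/runs/example")
    calexp = butler.get("calexp", instrument="HSC", detector=16,
                        visit=903334)

    # A resolved DatasetRef from a registry query can be passed directly.
    ref = next(iter(butler.registry.queryDatasets(
        "calexp", collections="HSC/runs/example")))
    calexp = butler.get(ref)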

1414 """ 

1415 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1416 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1417 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1418 

1419 def getURIs( 

1420 self, 

1421 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1422 /, 

1423 dataId: Optional[DataId] = None, 

1424 *, 

1425 predict: bool = False, 

1426 collections: Any = None, 

1427 run: Optional[str] = None, 

1428 **kwargs: Any, 

1429 ) -> DatasetRefURIs: 

1430 """Returns the URIs associated with the dataset. 

1431 

1432 Parameters 

1433 ---------- 

1434 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1435 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1436 Otherwise the `DatasetType` or name thereof. 

1437 dataId : `dict` or `DataCoordinate` 

1438 A `dict` of `Dimension` link name, value pairs that label the 

1439 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1440 should be provided as the first argument. 

1441 predict : `bool` 

1442 If `True`, allow URIs to be returned of datasets that have not 

1443 been written. 

1444 collections : Any, optional 

1445 Collections to be searched, overriding ``self.collections``. 

1446 Can be any of the types supported by the ``collections`` argument 

1447 to butler construction. 

1448 run : `str`, optional 

1449 Run to use for predictions, overriding ``self.run``. 

1450 **kwargs 

1451 Additional keyword arguments used to augment or construct a 

1452 `DataCoordinate`. See `DataCoordinate.standardize` 

1453 parameters. 

1454 

1455 Returns 

1456 ------- 

1457 uris : `DatasetRefURIs` 

1458 The URI to the primary artifact associated with this dataset (if 

1459 the dataset was disassembled within the datastore this may be 

1460 `None`), and the URIs to any components associated with the dataset 

1461 artifact (this mapping can be empty if there are no components). 
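
Examples
--------
A minimal usage sketch (dataset type and data ID values are illustrative
assumptions)::

    butler = Butler("/path/to/repo", collections="HSC/runs/example")
    primary, components = butler.getURIs("calexp", instrument="HSC",
                                         detector=16, visit=903334)
    if primary is None:
        # The dataset was disassembled; inspect the per-component URIs.
        for component_name, uri in components.items():
            print(component_name, uri)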

1462 """ 

1463 ref = self._findDatasetRef( 

1464 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1465 ) 

1466 return self.datastore.getURIs(ref, predict) 

1467 

1468 def getURI( 

1469 self, 

1470 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1471 /, 

1472 dataId: Optional[DataId] = None, 

1473 *, 

1474 predict: bool = False, 

1475 collections: Any = None, 

1476 run: Optional[str] = None, 

1477 **kwargs: Any, 

1478 ) -> ResourcePath: 

1479 """Return the URI to the Dataset. 

1480 

1481 Parameters 

1482 ---------- 

1483 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1484 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1485 Otherwise the `DatasetType` or name thereof. 

1486 dataId : `dict` or `DataCoordinate` 

1487 A `dict` of `Dimension` link name, value pairs that label the 

1488 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1489 should be provided as the first argument. 

1490 predict : `bool` 

1491 If `True`, allow URIs to be returned of datasets that have not 

1492 been written. 

1493 collections : Any, optional 

1494 Collections to be searched, overriding ``self.collections``. 

1495 Can be any of the types supported by the ``collections`` argument 

1496 to butler construction. 

1497 run : `str`, optional 

1498 Run to use for predictions, overriding ``self.run``. 

1499 **kwargs 

1500 Additional keyword arguments used to augment or construct a 

1501 `DataCoordinate`. See `DataCoordinate.standardize` 

1502 parameters. 

1503 

1504 Returns 

1505 ------- 

1506 uri : `lsst.resources.ResourcePath` 

1507 URI pointing to the Dataset within the datastore. If the 

1508 Dataset does not exist in the datastore, and if ``predict`` is 

1509 `True`, the URI will be a prediction and will include a URI 

1510 fragment "#predicted". 

1511 If the datastore does not have entities that relate well 

1512 to the concept of a URI, the returned URI string will be 

1513 descriptive. The returned URI is not guaranteed to be obtainable. 

1514 

1515 Raises 

1516 ------ 

1517 LookupError 

1518 Raised if a URI has been requested for a dataset that does not exist 

1519 and guessing is not allowed. 

1520 ValueError 

1521 Raised if a resolved `DatasetRef` was passed as an input, but it 

1522 differs from the one found in the registry. 

1523 TypeError 

1524 Raised if no collections were provided. 

1525 RuntimeError 

1526 Raised if a URI is requested for a dataset that consists of 

1527 multiple artifacts. 
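
Examples
--------
A minimal usage sketch (dataset type, data ID values, and run name are
illustrative assumptions)::

    butler = Butler("/path/to/repo", collections="HSC/runs/example")
    uri = butler.getURI("calexp", instrument="HSC", detector=16,
                        visit=903334)

    # Predict where a dataset that has not been written yet would go.
    future_uri = butler.getURI("calexp", instrument="HSC", detector=16,
                               visit=903334, predict=True,
                               run="u/someone/scratch")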

1528 """ 

1529 primary, components = self.getURIs( 

1530 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1531 ) 

1532 

1533 if primary is None or components: 

1534 raise RuntimeError( 

1535 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1536 "Use Butler.getURIs() instead." 

1537 ) 

1538 return primary 

1539 

1540 def retrieveArtifacts( 

1541 self, 

1542 refs: Iterable[DatasetRef], 

1543 destination: ResourcePathExpression, 

1544 transfer: str = "auto", 

1545 preserve_path: bool = True, 

1546 overwrite: bool = False, 

1547 ) -> List[ResourcePath]: 

1548 """Retrieve the artifacts associated with the supplied refs. 

1549 

1550 Parameters 

1551 ---------- 

1552 refs : iterable of `DatasetRef` 

1553 The datasets for which artifacts are to be retrieved. 

1554 A single ref can result in multiple artifacts. The refs must 

1555 be resolved. 

1556 destination : `lsst.resources.ResourcePath` or `str` 

1557 Location to write the artifacts. 

1558 transfer : `str`, optional 

1559 Method to use to transfer the artifacts. Must be one of the options 

1560 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1561 "move" is not allowed. 

1562 preserve_path : `bool`, optional 

1563 If `True` the full path of the artifact within the datastore 

1564 is preserved. If `False` the final file component of the path 

1565 is used. 

1566 overwrite : `bool`, optional 

1567 If `True` allow transfers to overwrite existing files at the 

1568 destination. 

1569 

1570 Returns 

1571 ------- 

1572 targets : `list` of `lsst.resources.ResourcePath` 

1573 URIs of file artifacts in destination location. Order is not 

1574 preserved. 

1575 

1576 Notes 

1577 ----- 

1578 For non-file datastores the artifacts written to the destination 

1579 may not match the representation inside the datastore. For example 

1580 a hierarchical data structure in a NoSQL database may well be stored 

1581 as a JSON file. 
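
Examples
--------
A minimal usage sketch (dataset type, collection name, and destination
path are illustrative assumptions)::

    butler = Butler("/path/to/repo")
    refs = butler.registry.queryDatasets("calexp",
                                         collections="HSC/runs/example")
    # Copy the underlying file artifacts to a local directory.
    paths = butler.retrieveArtifacts(refs, "/tmp/calexp-files",
                                     transfer="copy")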

1582 """ 

1583 return self.datastore.retrieveArtifacts( 

1584 refs, 

1585 ResourcePath(destination), 

1586 transfer=transfer, 

1587 preserve_path=preserve_path, 

1588 overwrite=overwrite, 

1589 ) 

1590 

1591 def datasetExists( 

1592 self, 

1593 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1594 dataId: Optional[DataId] = None, 

1595 *, 

1596 collections: Any = None, 

1597 **kwargs: Any, 

1598 ) -> bool: 

1599 """Return True if the Dataset is actually present in the Datastore. 

1600 

1601 Parameters 

1602 ---------- 

1603 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1604 When a `DatasetRef` is provided, the `dataId` should be `None`. 

1605 Otherwise the `DatasetType` or name thereof. 

1606 dataId : `dict` or `DataCoordinate` 

1607 A `dict` of `Dimension` link name, value pairs that label the 

1608 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1609 should be provided as the first argument. 

1610 collections : Any, optional 

1611 Collections to be searched, overriding ``self.collections``. 

1612 Can be any of the types supported by the ``collections`` argument 

1613 to butler construction. 

1614 **kwargs 

1615 Additional keyword arguments used to augment or construct a 

1616 `DataCoordinate`. See `DataCoordinate.standardize` 

1617 parameters. 

1618 

1619 Raises 

1620 ------ 

1621 LookupError 

1622 Raised if the dataset is not even present in the Registry. 

1623 ValueError 

1624 Raised if a resolved `DatasetRef` was passed as an input, but it 

1625 differs from the one found in the registry. 

1626 TypeError 

1627 Raised if no collections were provided. 
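
Examples
--------
A minimal usage sketch (dataset type and data ID values are illustrative
assumptions)::

    butler = Butler("/path/to/repo", collections="HSC/runs/example")
    exists = butler.datasetExists("calexp", instrument="HSC",
                                  detector=16, visit=903334)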

1628 """ 

1629 # A resolved ref may be given that is not known to this butler. 

1630 if isinstance(datasetRefOrType, DatasetRef): 

1631 ref = self.registry.getDataset(datasetRefOrType.id) 

1632 if ref is None: 

1633 raise LookupError( 

1634 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1635 ) 

1636 else: 

1637 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1638 return self.datastore.exists(ref) 

1639 

1640 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1641 """Remove one or more `~CollectionType.RUN` collections and the 

1642 datasets within them. 

1643 

1644 Parameters 

1645 ---------- 

1646 names : `Iterable` [ `str` ] 

1647 The names of the collections to remove. 

1648 unstore : `bool`, optional 

1649 If `True` (default), delete datasets from all datastores in which 

1650 they are present, and attempt to roll back the registry deletions if 

1651 datastore deletions fail (which may not always be possible). If 

1652 `False`, datastore records for these datasets are still removed, 

1653 but any artifacts (e.g. files) will not be. 

1654 

1655 Raises 

1656 ------ 

1657 TypeError 

1658 Raised if one or more collections are not of type 

1659 `~CollectionType.RUN`. 
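
Examples
--------
A minimal usage sketch (the run name is an illustrative assumption;
with ``unstore=True`` the underlying file artifacts are also deleted)::

    butler = Butler("/path/to/repo", writeable=True)
    butler.removeRuns(["u/someone/scratch"], unstore=True)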

1660 """ 

1661 if not self.isWriteable(): 

1662 raise TypeError("Butler is read-only.") 

1663 names = list(names) 

1664 refs: List[DatasetRef] = [] 

1665 for name in names: 

1666 collectionType = self.registry.getCollectionType(name) 

1667 if collectionType is not CollectionType.RUN: 

1668 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1669 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1670 with self.datastore.transaction(): 

1671 with self.registry.transaction(): 

1672 if unstore: 

1673 self.datastore.trash(refs) 

1674 else: 

1675 self.datastore.forget(refs) 

1676 for name in names: 

1677 self.registry.removeCollection(name) 

1678 if unstore: 

1679 # Point of no return for removing artifacts 

1680 self.datastore.emptyTrash() 

1681 

1682 def pruneDatasets( 

1683 self, 

1684 refs: Iterable[DatasetRef], 

1685 *, 

1686 disassociate: bool = True, 

1687 unstore: bool = False, 

1688 tags: Iterable[str] = (), 

1689 purge: bool = False, 

1690 ) -> None: 

1691 # docstring inherited from LimitedButler 

1692 

1693 if not self.isWriteable(): 

1694 raise TypeError("Butler is read-only.") 

1695 if purge: 

1696 if not disassociate: 

1697 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1698 if not unstore: 

1699 raise TypeError("Cannot pass purge=True without unstore=True.") 

1700 elif disassociate: 

1701 tags = tuple(tags) 

1702 if not tags: 

1703 raise TypeError("No tags provided but disassociate=True.") 

1704 for tag in tags: 

1705 collectionType = self.registry.getCollectionType(tag) 

1706 if collectionType is not CollectionType.TAGGED: 

1707 raise TypeError( 

1708 f"Cannot disassociate from collection '{tag}' " 

1709 f"of non-TAGGED type {collectionType.name}." 

1710 ) 

1711 # Transform possibly-single-pass iterable into something we can iterate 

1712 # over multiple times. 

1713 refs = list(refs) 

1714 # Pruning a component of a DatasetRef makes no sense since registry 

1715 # doesn't know about components and datastore might not store 

1716 # components in a separate file 

1717 for ref in refs: 

1718 if ref.datasetType.component(): 

1719 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})") 

1720 # We don't need an unreliable Datastore transaction for this, because 

1721 # we've been extra careful to ensure that Datastore.trash only involves 

1722 # mutating the Registry (it can _look_ at Datastore-specific things, 

1723 # but shouldn't change them), and hence all operations here are 

1724 # Registry operations. 

1725 with self.datastore.transaction(): 

1726 with self.registry.transaction(): 

1727 if unstore: 

1728 self.datastore.trash(refs) 

1729 if purge: 

1730 self.registry.removeDatasets(refs) 

1731 elif disassociate: 

1732 assert tags, "Guaranteed by earlier logic in this function." 

1733 for tag in tags: 

1734 self.registry.disassociate(tag, refs) 

1735 # We've exited the Registry transaction, and apparently committed. 

1736 # (if there was an exception, everything rolled back, and it's as if 

1737 # nothing happened - and we never get here). 

1738 # Datastore artifacts are not yet gone, but they're clearly marked 

1739 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1740 # problems we can try again later, and if manual administrative 

1741 # intervention is required, it's pretty clear what that should entail: 

1742 # deleting everything on disk and in private Datastore tables that is 

1743 # in the dataset_location_trash table. 

1744 if unstore: 

1745 # Point of no return for removing artifacts 

1746 self.datastore.emptyTrash() 

1747 

1748 @transactional 

1749 def ingest( 

1750 self, 

1751 *datasets: FileDataset, 

1752 transfer: Optional[str] = "auto", 

1753 run: Optional[str] = None, 

1754 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1755 record_validation_info: bool = True, 

1756 ) -> None: 

1757 """Store and register one or more datasets that already exist on disk. 

1758 

1759 Parameters 

1760 ---------- 

1761 datasets : `FileDataset` 

1762 Each positional argument is a struct containing information about 

1763 a file to be ingested, including its URI (either absolute or 

1764 relative to the datastore root, if applicable), a resolved 

1765 `DatasetRef`, and optionally a formatter class or its 

1766 fully-qualified string name. If a formatter is not provided, the 

1767 formatter that would be used for `put` is assumed. On successful 

1768 ingest all `FileDataset.formatter` attributes will be set to the 

1769 formatter class used. `FileDataset.path` attributes may be modified 

1770 to put paths in whatever the datastore considers a standardized 

1771 form. 

1772 transfer : `str`, optional 

1773 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1774 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1775 transfer the file. 

1776 run : `str`, optional 

1777 The name of the run ingested datasets should be added to, 

1778 overriding ``self.run``. This parameter is now deprecated since 

1779 the run is encoded in the ``FileDataset``. 

1780 idGenerationMode : `DatasetIdGenEnum`, optional 

1781 Specifies option for generating dataset IDs. By default unique IDs 

1782 are generated for each inserted dataset. 

1783 record_validation_info : `bool`, optional 

1784 If `True`, the default, the datastore can record validation 

1785 information associated with the file. If `False` the datastore 

1786 will not attempt to track any information such as checksums 

1787 or file sizes. This can be useful if such information is tracked 

1788 in an external system or if the file is to be compressed in place. 

1789 It is up to the datastore whether this parameter is relevant. 

1790 

1791 Raises 

1792 ------ 

1793 TypeError 

1794 Raised if the butler is read-only or if no run was provided. 

1795 NotImplementedError 

1796 Raised if the `Datastore` does not support the given transfer mode. 

1797 DatasetTypeNotSupportedError 

1798 Raised if one or more files to be ingested have a dataset type that 

1799 is not supported by the `Datastore`. 

1800 FileNotFoundError 

1801 Raised if one of the given files does not exist. 

1802 FileExistsError 

1803 Raised if transfer is not `None` but the (internal) location the 

1804 file would be moved to is already occupied. 

1805 

1806 Notes 

1807 ----- 

1808 This operation is not fully exception safe: if a database operation 

1809 fails, the given `FileDataset` instances may be only partially updated. 

1810 

1811 It is atomic in terms of database operations (they will either all 

1812 succeed or all fail), provided the database engine implements 

1813 transactions correctly. It will attempt to be atomic in terms of 

1814 filesystem operations as well, but this cannot be implemented 

1815 rigorously for most datastores. 
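
Examples
--------
A minimal usage sketch (the file path, dataset type, run name, and data
ID values are illustrative assumptions; the dataset type and run are
assumed to be registered already)::

    from lsst.daf.butler import Butler, DatasetRef, FileDataset

    butler = Butler("/path/to/repo", writeable=True)
    datasetType = butler.registry.getDatasetType("raw")
    ref = DatasetRef(datasetType,
                     {"instrument": "HSC", "exposure": 903334,
                      "detector": 16},
                     run="HSC/raw/all")
    butler.ingest(FileDataset(path="/data/somefile.fits", refs=ref),
                  transfer="copy")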

1816 """ 

1817 if not self.isWriteable(): 

1818 raise TypeError("Butler is read-only.") 

1819 

1820 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1821 if not datasets: 

1822 return 

1823 

1824 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1825 

1826 # We need to reorganize all the inputs so that they are grouped 

1827 # by dataset type and run. Multiple refs in a single FileDataset 

1828 # are required to share the run and dataset type. 

1829 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1830 groupedData: GroupedData = defaultdict(list) 

1831 

1832 # Track DataIDs that are being ingested so we can spot issues early 

1833 # with duplication. Retain previous FileDataset so we can report it. 

1834 groupedDataIds: MutableMapping[ 

1835 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1836 ] = defaultdict(dict) 

1837 

1838 used_run = False 

1839 

1840 # And the nested loop that populates it: 

1841 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1842 # Somewhere to store pre-existing refs if we have an 

1843 # execution butler. 

1844 existingRefs: List[DatasetRef] = [] 

1845 

1846 for ref in dataset.refs: 

1847 assert ref.run is not None # For mypy 

1848 group_key = (ref.datasetType, ref.run) 

1849 

1850 if ref.dataId in groupedDataIds[group_key]: 

1851 raise ConflictingDefinitionError( 

1852 f"Ingest conflict. Dataset {dataset.path} has same" 

1853 " DataId as other ingest dataset" 

1854 f" {groupedDataIds[group_key][ref.dataId].path} " 

1855 f" ({ref.dataId})" 

1856 ) 

1857 

1858 groupedDataIds[group_key][ref.dataId] = dataset 

1859 

1860 if existingRefs: 

1861 if len(dataset.refs) != len(existingRefs): 

1862 # Keeping track of partially pre-existing datasets is hard 

1863 # and should generally never happen. For now don't allow 

1864 # it. 

1865 raise ConflictingDefinitionError( 

1866 f"For dataset {dataset.path} some dataIds already exist" 

1867 " in registry but others do not. This is not supported." 

1868 ) 

1869 

1870 # Store expanded form in the original FileDataset. 

1871 dataset.refs = existingRefs 

1872 else: 

1873 groupedData[group_key].append(dataset) 

1874 

1875 if not used_run and run is not None: 

1876 warnings.warn( 

1877 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

1878 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

1879 category=FutureWarning, 

1880 stacklevel=3, # Take into account the @transactional decorator. 

1881 ) 

1882 

1883 # Now we can bulk-insert into Registry for each DatasetType. 

1884 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1885 groupedData.items(), desc="Bulk-inserting datasets by type" 

1886 ): 

1887 refs_to_import = [] 

1888 for dataset in grouped_datasets: 

1889 refs_to_import.extend(dataset.refs) 

1890 

1891 n_refs = len(refs_to_import) 

1892 log.verbose( 

1893 "Importing %d ref%s of dataset type %r into run %r", 

1894 n_refs, 

1895 "" if n_refs == 1 else "s", 

1896 datasetType.name, 

1897 this_run, 

1898 ) 

1899 

1900 # Import the refs and expand the DataCoordinates since we can't 

1901 # guarantee that they are expanded and Datastore will need 

1902 # the records. 

1903 imported_refs = self.registry._importDatasets(refs_to_import, expand=True) 

1904 assert set(imported_refs) == set(refs_to_import) 

1905 

1906 # Replace all the refs in the FileDataset with expanded versions. 

1907 # Pull them off in the order we put them on the list. 

1908 for dataset in grouped_datasets: 

1909 n_dataset_refs = len(dataset.refs) 

1910 dataset.refs = imported_refs[:n_dataset_refs] 

1911 del imported_refs[:n_dataset_refs] 

1912 

1913 # Bulk-insert everything into Datastore. 

1914 # We do not know if any of the registry entries already existed 

1915 # (_importDatasets only complains if they exist but differ) so 

1916 # we have to catch IntegrityError explicitly. 

1917 try: 

1918 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1919 except IntegrityError as e: 

1920 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1921 

1922 @contextlib.contextmanager 

1923 def export( 

1924 self, 

1925 *, 

1926 directory: Optional[str] = None, 

1927 filename: Optional[str] = None, 

1928 format: Optional[str] = None, 

1929 transfer: Optional[str] = None, 

1930 ) -> Iterator[RepoExportContext]: 

1931 """Export datasets from the repository represented by this `Butler`. 

1932 

1933 This method is a context manager that returns a helper object 

1934 (`RepoExportContext`) that is used to indicate what information from 

1935 the repository should be exported. 

1936 

1937 Parameters 

1938 ---------- 

1939 directory : `str`, optional 

1940 Directory dataset files should be written to if ``transfer`` is not 

1941 `None`. 

1942 filename : `str`, optional 

1943 Name for the file that will include database information associated 

1944 with the exported datasets. If this is not an absolute path and 

1945 ``directory`` is not `None`, it will be written to ``directory`` 

1946 instead of the current working directory. Defaults to 

1947 "export.{format}". 

1948 format : `str`, optional 

1949 File format for the database information file. If `None`, the 

1950 extension of ``filename`` will be used. 

1951 transfer : `str`, optional 

1952 Transfer mode passed to `Datastore.export`. 

1953 

1954 Raises 

1955 ------ 

1956 TypeError 

1957 Raised if the set of arguments passed is inconsistent. 

1958 

1959 Examples 

1960 -------- 

1961 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1962 methods are used to provide the iterables over data IDs and/or datasets 

1963 to be exported:: 

1964 

1965 with butler.export(filename="exports.yaml") as export: 

1966 # Export all flats, but none of the dimension element rows 

1967 # (i.e. data ID information) associated with them. 

1968 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1969 elements=()) 

1970 # Export all datasets that start with "deepCoadd_" and all of 

1971 # their associated data ID information. 

1972 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1973 """ 

1974 if directory is None and transfer is not None: 

1975 raise TypeError("Cannot transfer without providing a directory.") 

1976 if transfer == "move": 

1977 raise TypeError("Transfer may not be 'move': export is read-only") 

1978 if format is None: 

1979 if filename is None: 

1980 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1981 else: 

1982 _, format = os.path.splitext(filename) 

1983 if not format: 

1984 raise ValueError("Please specify a file extension to determine export format.") 

1985 format = format[1:]  # Strip leading "." 

1986 elif filename is None: 

1987 filename = f"export.{format}" 

1988 if directory is not None: 

1989 filename = os.path.join(directory, filename) 

1990 formats = self._config["repo_transfer_formats"] 

1991 if format not in formats: 

1992 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1993 BackendClass = get_class_of(formats[format, "export"]) 

1994 with open(filename, "w") as stream: 

1995 backend = BackendClass(stream, universe=self.registry.dimensions) 

1996 try: 

1997 helper = RepoExportContext( 

1998 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

1999 ) 

2000 yield helper 

2001 except BaseException: 

2002 raise 

2003 else: 

2004 helper._finish() 

2005 

2006 def import_( 

2007 self, 

2008 *, 

2009 directory: Optional[ResourcePathExpression] = None, 

2010 filename: Union[ResourcePathExpression, TextIO, None] = None, 

2011 format: Optional[str] = None, 

2012 transfer: Optional[str] = None, 

2013 skip_dimensions: Optional[Set] = None, 

2014 ) -> None: 

2015 """Import datasets into this repository that were exported from a 

2016 different butler repository via `~lsst.daf.butler.Butler.export`. 

2017 

2018 Parameters 

2019 ---------- 

2020 directory : `~lsst.resources.ResourcePathExpression`, optional 

2021 Directory containing dataset files to import from. If `None`, 

2022 ``filename`` and all dataset file paths specified therein must 

2023 be absolute. 

2024 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2025 A stream or name of file that contains database information 

2026 associated with the exported datasets, typically generated by 

2027 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2028 `~lsst.resources.ResourcePath` and is not an absolute path, 

2029 it will first be looked for relative to ``directory`` and if not 

2030 found there it will be looked for in the current working 

2031 directory. Defaults to "export.{format}". 

2032 format : `str`, optional 

2033 File format for ``filename``. If `None`, the extension of 

2034 ``filename`` will be used. 

2035 transfer : `str`, optional 

2036 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2037 skip_dimensions : `set`, optional 

2038 Names of dimensions that should be skipped and not imported. 

2039 

2040 Raises 

2041 ------ 

2042 TypeError 

2043 Raised if the set of arguments passed is inconsistent, or if the 

2044 butler is read-only. 
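
Examples
--------
A minimal usage sketch (the paths are illustrative assumptions; the
export file is assumed to have been written by `Butler.export`)::

    butler = Butler("/path/to/repo", writeable=True)
    # The relative filename is looked for inside the given directory.
    butler.import_(directory="/path/to/exported/data",
                   filename="export.yaml",
                   transfer="copy")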

2045 """ 

2046 if not self.isWriteable(): 

2047 raise TypeError("Butler is read-only.") 

2048 if format is None: 

2049 if filename is None: 

2050 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2051 else: 

2052 _, format = os.path.splitext(filename) # type: ignore 

2053 elif filename is None: 

2054 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2055 if directory is not None: 

2056 directory = ResourcePath(directory, forceDirectory=True) 

2057 # mypy doesn't think this will work but it does in python >= 3.10. 

2058 if isinstance(filename, ResourcePathExpression): # type: ignore 

2059 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2060 if not filename.isabs() and directory is not None: 

2061 potential = directory.join(filename) 

2062 exists_in_cwd = filename.exists() 

2063 exists_in_dir = potential.exists() 

2064 if exists_in_cwd and exists_in_dir: 

2065 log.warning( 

2066 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2067 "Additionally, the file exists relative to the given search directory (%s). " 

2068 "Using the export file in the given directory.", 

2069 filename, 

2070 potential, 

2071 ) 

2072 # Given they specified an explicit directory and that 

2073 # directory has the export file in it, assume that that 

2074 # is what was meant despite the file in cwd. 

2075 filename = potential 

2076 elif exists_in_dir: 

2077 filename = potential 

2078 elif not exists_in_cwd and not exists_in_dir: 

2079 # Raise early. 

2080 raise FileNotFoundError( 

2081 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2082 ) 

2083 BackendClass: type[RepoImportBackend] = get_class_of( 

2084 self._config["repo_transfer_formats"][format]["import"] 

2085 ) 

2086 

2087 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2088 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg] 

2089 backend.register() 

2090 with self.transaction(): 

2091 backend.load( 

2092 self.datastore, 

2093 directory=directory, 

2094 transfer=transfer, 

2095 skip_dimensions=skip_dimensions, 

2096 ) 

2097 

2098 if isinstance(filename, ResourcePath): 

2099 # We can not use open() here at the moment because of 

2100 # DM-38589 since yaml does stream.read(8192) in a loop. 

2101 stream = io.StringIO(filename.read().decode()) 

2102 doImport(stream) 

2103 else: 

2104 doImport(filename) # type: ignore 

2105 

2106 def transfer_from( 

2107 self, 

2108 source_butler: LimitedButler, 

2109 source_refs: Iterable[DatasetRef], 

2110 transfer: str = "auto", 

2111 skip_missing: bool = True, 

2112 register_dataset_types: bool = False, 

2113 transfer_dimensions: bool = False, 

2114 ) -> collections.abc.Collection[DatasetRef]: 

2115 """Transfer datasets to this Butler from a run in another Butler. 

2116 

2117 Parameters 

2118 ---------- 

2119 source_butler : `LimitedButler` 

2120 Butler from which the datasets are to be transferred. If data IDs 

2121 in ``source_refs`` are not expanded then this has to be a full 

2122 `Butler` whose registry will be used to expand data IDs. 

2123 source_refs : iterable of `DatasetRef` 

2124 Datasets defined in the source butler that should be transferred to 

2125 this butler. 

2126 transfer : `str`, optional 

2127 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2128 skip_missing : `bool` 

2129 If `True`, datasets with no datastore artifact associated with 

2130 them are not transferred. If `False` a registry entry will be 

2131 created even if no datastore record is created (and so will 

2132 look equivalent to the dataset being unstored). 

2133 register_dataset_types : `bool` 

2134 If `True` any missing dataset types are registered. Otherwise 

2135 an exception is raised. 

2136 transfer_dimensions : `bool`, optional 

2137 If `True`, dimension record data associated with the new datasets 

2138 will be transferred. 

2139 

2140 Returns 

2141 ------- 

2142 refs : `list` of `DatasetRef` 

2143 The refs added to this Butler. 

2144 

2145 Notes 

2146 ----- 

2147 The datastore artifact has to exist for a transfer 

2148 to be made, but non-existence is not an error. 

2149 

2150 Datasets that already exist in this run will be skipped. 

2151 

2152 The datasets are imported as part of a transaction, although 

2153 dataset types are registered before the transaction is started. 

2154 This means that it is possible for a dataset type to be registered 

2155 even though transfer has failed. 
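
Examples
--------
A minimal usage sketch (repository paths, collection name, and dataset
type are illustrative assumptions)::

    source = Butler("/path/to/source-repo")
    target = Butler("/path/to/target-repo", writeable=True)
    refs = source.registry.queryDatasets("calexp",
                                         collections="HSC/runs/example")
    transferred = target.transfer_from(source, refs, transfer="copy",
                                       register_dataset_types=True)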

2156 """ 

2157 if not self.isWriteable(): 

2158 raise TypeError("Butler is read-only.") 

2159 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2160 

2161 # Will iterate through the refs multiple times so need to convert 

2162 # to a list if this isn't a collection. 

2163 if not isinstance(source_refs, collections.abc.Collection): 

2164 source_refs = list(source_refs) 

2165 

2166 original_count = len(source_refs) 

2167 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2168 

2169 # In some situations the datastore artifact may be missing 

2170 # and we do not want that registry entry to be imported. 

2171 # Asking datastore is not sufficient, the records may have been 

2172 # purged, we have to ask for the (predicted) URI and check 

2173 # existence explicitly. Execution butler is set up exactly like 

2174 # this with no datastore records. 

2175 artifact_existence: Dict[ResourcePath, bool] = {} 

2176 if skip_missing: 

2177 dataset_existence = source_butler.datastore.mexists( 

2178 source_refs, artifact_existence=artifact_existence 

2179 ) 

2180 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2181 filtered_count = len(source_refs) 

2182 n_missing = original_count - filtered_count 

2183 log.verbose( 

2184 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2185 n_missing, 

2186 "" if n_missing == 1 else "s", 

2187 filtered_count, 

2188 ) 

2189 

2190 # Importing requires that we group the refs by dataset type and run 

2191 # before doing the import. 

2192 source_dataset_types = set() 

2193 grouped_refs = defaultdict(list) 

2194 for ref in source_refs: 

2195 grouped_refs[ref.datasetType, ref.run].append(ref) 

2196 source_dataset_types.add(ref.datasetType) 

2197 

2198 # Check to see if the dataset type in the source butler has 

2199 # the same definition in the target butler and register missing 

2200 # ones if requested. Registration must happen outside a transaction. 

2201 newly_registered_dataset_types = set() 

2202 for datasetType in source_dataset_types: 

2203 if register_dataset_types: 

2204 # Let this raise immediately if inconsistent. Continuing 

2205 # on to find additional inconsistent dataset types 

2206 # might result in additional unwanted dataset types being 

2207 # registered. 

2208 if self.registry.registerDatasetType(datasetType): 

2209 newly_registered_dataset_types.add(datasetType) 

2210 else: 

2211 # If the dataset type is missing, let it fail immediately. 

2212 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2213 if target_dataset_type != datasetType: 

2214 raise ConflictingDefinitionError( 

2215 "Source butler dataset type differs from definition" 

2216 f" in target butler: {datasetType} !=" 

2217 f" {target_dataset_type}" 

2218 ) 

2219 if newly_registered_dataset_types: 

2220 # We may have registered some even if there were inconsistencies 

2221 # but should let people know (or else remove them again). 

2222 log.log( 

2223 VERBOSE, 

2224 "Registered the following dataset types in the target Butler: %s", 

2225 ", ".join(d.name for d in newly_registered_dataset_types), 

2226 ) 

2227 else: 

2228 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2229 

2230 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2231 if transfer_dimensions: 

2232 # Collect all the dimension records for these refs. 

2233 # All dimensions are to be copied but the list of valid dimensions 

2234 # comes from this butler's universe. 

2235 elements = frozenset( 

2236 element 

2237 for element in self.registry.dimensions.getStaticElements() 

2238 if element.hasTable() and element.viewOf is None 

2239 ) 

2240 dataIds = set(ref.dataId for ref in source_refs) 

2241 # This logic comes from saveDataIds. 

2242 for dataId in dataIds: 

2243 # Need an expanded record; if it is not expanded we need a full 

2244 # butler with a registry (allow mocks with a registry too). 

2245 if not dataId.hasRecords(): 

2246 if registry := getattr(source_butler, "registry", None): 

2247 dataId = registry.expandDataId(dataId) 

2248 else: 

2249 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2250 # If this butler doesn't know about a dimension in the source 

2251 # butler things will break later. 

2252 for record in dataId.records.values(): 

2253 if record is not None and record.definition in elements: 

2254 dimension_records[record.definition].setdefault(record.dataId, record) 

2255 

2256 handled_collections: Set[str] = set() 

2257 

2258 # Do all the importing in a single transaction. 

2259 with self.transaction(): 

2260 if dimension_records: 

2261 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2262 for element, r in dimension_records.items(): 

2263 records = [r[dataId] for dataId in r] 

2264 # Assume that if the record is already present that we can 

2265 # use it without having to check that the record metadata 

2266 # is consistent. 

2267 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2268 

2269 n_imported = 0 

2270 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2271 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2272 ): 

2273 if run not in handled_collections: 

2274 # May need to create output collection. If source butler 

2275 # has a registry, ask for documentation string. 

2276 run_doc = None 

2277 if registry := getattr(source_butler, "registry", None): 

2278 run_doc = registry.getCollectionDocumentation(run) 

2279 registered = self.registry.registerRun(run, doc=run_doc) 

2280 handled_collections.add(run) 

2281 if registered: 

2282 log.log(VERBOSE, "Creating output run %s", run) 

2283 

2284 n_refs = len(refs_to_import) 

2285 log.verbose( 

2286 "Importing %d ref%s of dataset type %s into run %s", 

2287 n_refs, 

2288 "" if n_refs == 1 else "s", 

2289 datasetType.name, 

2290 run, 

2291 ) 

2292 

2293 # Assume we are using UUIDs and the source refs will match 

2294 # those imported. 

2295 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2296 assert set(imported_refs) == set(refs_to_import) 

2297 n_imported += len(imported_refs) 

2298 

2299 assert len(source_refs) == n_imported 

2300 log.verbose("Imported %d datasets into destination butler", n_imported) 

2301 

2302 # Ask the datastore to transfer. The datastore has to check that 

2303 # the source datastore is compatible with the target datastore. 

2304 accepted, rejected = self.datastore.transfer_from( 

2305 source_butler.datastore, 

2306 source_refs, 

2307 transfer=transfer, 

2308 artifact_existence=artifact_existence, 

2309 ) 

2310 if rejected: 

2311 # For now, accept the registry entries but not the files. 

2312 log.warning( 

2313 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2314 len(rejected), 

2315 len(accepted), 

2316 datasetType, 

2317 run, 

2318 ) 

2319 

2320 return source_refs 

2321 

2322 def validateConfiguration( 

2323 self, 

2324 logFailures: bool = False, 

2325 datasetTypeNames: Optional[Iterable[str]] = None, 

2326 ignore: Iterable[str] | None = None, 

2327 ) -> None: 

2328 """Validate butler configuration. 

2329 

2330 Checks that each `DatasetType` can be stored in the `Datastore`. 

2331 

2332 Parameters 

2333 ---------- 

2334 logFailures : `bool`, optional 

2335 If `True`, output a log message for every validation error 

2336 detected. 

2337 datasetTypeNames : iterable of `str`, optional 

2338 The `DatasetType` names that should be checked. This allows 

2339 only a subset to be selected. 

2340 ignore : iterable of `str`, optional 

2341 Names of DatasetTypes to skip over. This can be used to skip 

2342 known problems. If a named `DatasetType` corresponds to a 

2343 composite, all components of that `DatasetType` will also be 

2344 ignored. 

2345 

2346 Raises 

2347 ------ 

2348 ButlerValidationError 

2349 Raised if there is some inconsistency with how this Butler 

2350 is configured. 
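
Examples
--------
A minimal usage sketch (the repository path is an illustrative
assumption)::

    butler = Butler("/path/to/repo")
    # Log each problem found and raise if the configuration is
    # inconsistent with the registered dataset types.
    butler.validateConfiguration(logFailures=True)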

2351 """ 

2352 if datasetTypeNames: 

2353 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2354 else: 

2355 datasetTypes = list(self.registry.queryDatasetTypes()) 

2356 

2357 # filter out anything from the ignore list 

2358 if ignore: 

2359 ignore = set(ignore) 

2360 datasetTypes = [ 

2361 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2362 ] 

2363 else: 

2364 ignore = set() 

2365 

2366 # Find all the registered instruments 

2367 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2368 

2369 # For each datasetType that has an instrument dimension, create 

2370 # a DatasetRef for each defined instrument 

2371 datasetRefs = [] 

2372 

2373 for datasetType in datasetTypes: 

2374 if "instrument" in datasetType.dimensions: 

2375 for instrument in instruments: 

2376 datasetRef = DatasetRef( 

2377 datasetType, 

2378 {"instrument": instrument}, # type: ignore 

2379 conform=False, 

2380 run="validate", 

2381 ) 

2382 datasetRefs.append(datasetRef) 

2383 

2384 entities: List[Union[DatasetType, DatasetRef]] = [] 

2385 entities.extend(datasetTypes) 

2386 entities.extend(datasetRefs) 

2387 

2388 datastoreErrorStr = None 

2389 try: 

2390 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2391 except ValidationError as e: 

2392 datastoreErrorStr = str(e) 

2393 

2394 # Also check that the LookupKeys used by the datastores match 

2395 # registry and storage class definitions 

2396 keys = self.datastore.getLookupKeys() 

2397 

2398 failedNames = set() 

2399 failedDataId = set() 

2400 for key in keys: 

2401 if key.name is not None: 

2402 if key.name in ignore: 

2403 continue 

2404 

2405 # skip if specific datasetType names were requested and this 

2406 # name does not match 

2407 if datasetTypeNames and key.name not in datasetTypeNames: 

2408 continue 

2409 

2410 # See if it is a StorageClass or a DatasetType 

2411 if key.name in self.storageClasses: 

2412 pass 

2413 else: 

2414 try: 

2415 self.registry.getDatasetType(key.name) 

2416 except KeyError: 

2417 if logFailures: 

2418 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2419 failedNames.add(key) 

2420 else: 

2421 # Dimensions are checked for consistency when the Butler 

2422 # is created and rendezvoused with a universe. 

2423 pass 

2424 

2425 # Check that the instrument is a valid instrument 

2426 # Currently only support instrument so check for that 

2427 if key.dataId: 

2428 dataIdKeys = set(key.dataId) 

2429 if set(["instrument"]) != dataIdKeys: 

2430 if logFailures: 

2431 log.critical("Key '%s' has unsupported DataId override", key) 

2432 failedDataId.add(key) 

2433 elif key.dataId["instrument"] not in instruments: 

2434 if logFailures: 

2435 log.critical("Key '%s' has unknown instrument", key) 

2436 failedDataId.add(key) 

2437 

2438 messages = [] 

2439 

2440 if datastoreErrorStr: 

2441 messages.append(datastoreErrorStr) 

2442 

2443 for failed, msg in ( 

2444 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2445 (failedDataId, "Keys with bad DataId entries: "), 

2446 ): 

2447 if failed: 

2448 msg += ", ".join(str(k) for k in failed) 

2449 messages.append(msg) 

2450 

2451 if messages: 

2452 raise ValidationError(";\n".join(messages)) 

2453 

2454 @property 

2455 def collections(self) -> Sequence[str]: 

2456 """The collections to search by default, in order 

2457 (`Sequence` [ `str` ]). 

2458 

2459 This is an alias for ``self.registry.defaults.collections``. It cannot 

2460 be set directly in isolation, but all defaults may be changed together 

2461 by assigning a new `RegistryDefaults` instance to 

2462 ``self.registry.defaults``. 

2463 """ 

2464 return self.registry.defaults.collections 

2465 

2466 @property 

2467 def run(self) -> Optional[str]: 

2468 """Name of the run this butler writes outputs to by default (`str` or 

2469 `None`). 

2470 

2471 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2472 directly in isolation, but all defaults may be changed together by 

2473 assigning a new `RegistryDefaults` instance to 

2474 ``self.registry.defaults``. 

2475 """ 

2476 return self.registry.defaults.run 

2477 

2478 @property 

2479 def dimensions(self) -> DimensionUniverse: 

2480 # Docstring inherited. 

2481 return self.registry.dimensions 

2482 

2483 registry: Registry 

2484 """The object that manages dataset metadata and relationships (`Registry`). 

2485 

2486 Most operations that don't involve reading or writing butler datasets are 

2487 accessible only via `Registry` methods. 

2488 """ 

2489 

2490 datastore: Datastore 

2491 """The object that manages actual dataset storage (`Datastore`). 

2492 

2493 Direct user access to the datastore should rarely be necessary; the primary 

2494 exception is the case where a `Datastore` implementation provides extra 

2495 functionality beyond what the base class defines. 

2496 """ 

2497 

2498 storageClasses: StorageClassFactory 

2499 """An object that maps known storage class names to objects that fully 

2500 describe them (`StorageClassFactory`). 

2501 """