Coverage for python/lsst/daf/butler/_butler.py: 8%

674 statements  

coverage.py v7.2.7, created at 2023-06-07 02:10 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import warnings 

39from collections import defaultdict 

40from typing import ( 

41 TYPE_CHECKING, 

42 Any, 

43 ClassVar, 

44 Counter, 

45 Dict, 

46 Iterable, 

47 Iterator, 

48 List, 

49 MutableMapping, 

50 Optional, 

51 Sequence, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from deprecated.sphinx import deprecated 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_class_of 

63from lsst.utils.logging import VERBOSE, getLogger 

64from sqlalchemy.exc import IntegrityError 

65 

66from ._butlerConfig import ButlerConfig 

67from ._butlerRepoIndex import ButlerRepoIndex 

68from ._deferredDatasetHandle import DeferredDatasetHandle 

69from ._limited_butler import LimitedButler 

70from .core import ( 

71 Config, 

72 ConfigSubset, 

73 DataCoordinate, 

74 DataId, 

75 DataIdValue, 

76 DatasetIdGenEnum, 

77 DatasetRef, 

78 DatasetRefURIs, 

79 DatasetType, 

80 Datastore, 

81 Dimension, 

82 DimensionConfig, 

83 DimensionElement, 

84 DimensionRecord, 

85 DimensionUniverse, 

86 FileDataset, 

87 Progress, 

88 StorageClass, 

89 StorageClassFactory, 

90 Timespan, 

91 ValidationError, 

92) 

93from .core.repoRelocation import BUTLER_ROOT_TAG 

94from .core.utils import transactional 

95from .registry import ( 

96 CollectionType, 

97 ConflictingDefinitionError, 

98 DataIdError, 

99 MissingDatasetTypeError, 

100 Registry, 

101 RegistryConfig, 

102 RegistryDefaults, 

103) 

104from .transfers import RepoExportContext 

105 

106if TYPE_CHECKING: 

107 from lsst.resources import ResourceHandleProtocol 

108 

109 from .transfers import RepoImportBackend 

110 

111log = getLogger(__name__) 

112 

113 

114class ButlerValidationError(ValidationError): 

115 """There is a problem with the Butler configuration.""" 

116 

117 pass 

118 

119 

120class Butler(LimitedButler): 

121 """Main entry point for the data access system. 

122 

123 Parameters 

124 ---------- 

125 config : `ButlerConfig`, `Config` or `str`, optional 

126 Configuration. Anything acceptable to the 

127 `ButlerConfig` constructor. If a directory path 

128 is given the configuration will be read from a ``butler.yaml`` file in 

129 that location. If `None` is given default values will be used. 

130 butler : `Butler`, optional 

131 If provided, construct a new Butler that uses the same registry and 

132 datastore as the given one, but with the given collection and run. 

133 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

134 arguments. 

135 collections : `str` or `Iterable` [ `str` ], optional 

136 An expression specifying the collections to be searched (in order) when 

137 reading datasets. 

138 This may be a `str` collection name or an iterable thereof. 

139 See :ref:`daf_butler_collection_expressions` for more information. 

140 These collections are not registered automatically and must be 

141 manually registered before they are used by any method, but they may be 

142 manually registered after the `Butler` is initialized. 

143 run : `str`, optional 

144 Name of the `~CollectionType.RUN` collection new datasets should be 

145 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

146 ``collections`` will be set to ``[run]``. If not `None`, this 

147 collection will automatically be registered. If this is not set (and 

148 ``writeable`` is not set either), a read-only butler will be created. 

149 searchPaths : `list` of `str`, optional 

150 Directory paths to search when calculating the full Butler 

151 configuration. Not used if the supplied config is already a 

152 `ButlerConfig`. 

153 writeable : `bool`, optional 

154 Explicitly sets whether the butler supports write operations. If not 

155 provided, a read-write butler is created if ``run`` is set; 

156 otherwise the butler is read-only. 

157 inferDefaults : `bool`, optional 

158 If `True` (default) infer default data ID values from the values 

159 present in the datasets in ``collections``: if all collections have the 

160 same value (or no value) for a governor dimension, that value will be 

161 the default for that dimension. Nonexistent collections are ignored. 

162 If a default value is provided explicitly for a governor dimension via 

163 ``**kwargs``, no default will be inferred for that dimension. 

164 **kwargs : `str` 

165 Default data ID key-value pairs. These may only identify "governor" 

166 dimensions like ``instrument`` and ``skymap``. 

167 

168 Examples 

169 -------- 

170 While there are many ways to control exactly how a `Butler` interacts with 

171 the collections in its `Registry`, the most common cases are still simple. 

172 

173 For a read-only `Butler` that searches one collection, do:: 

174 

175 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

176 

177 For a read-write `Butler` that writes to and reads from a 

178 `~CollectionType.RUN` collection:: 

179 

180 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

181 

182 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

183 because we want to write to one `~CollectionType.RUN` collection but read 

184 from several others (as well):: 

185 

186 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

187 collections=["u/alice/DM-50000/a", 

188 "u/bob/DM-49998", 

189 "HSC/defaults"]) 

190 

191 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

192 Datasets will be read first from that run (since it appears first in the 

193 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

194 

195 Finally, one can always create a `Butler` with no collections:: 

196 

197 butler = Butler("/path/to/repo", writeable=True) 

198 

199 This can be extremely useful when you just want to use ``butler.registry``, 

200 e.g. for inserting dimension data or managing collections, or when the 

201 collections you want to use with the butler are not consistent. 

202 Passing ``writeable`` explicitly here is only necessary if you want to be 

203 able to make changes to the repo - usually the value for ``writeable`` can 

204 be guessed from the collection arguments provided, but it defaults to 

205 `False` when there are no collection arguments. 

206 """ 

207 

208 def __init__( 

209 self, 

210 config: Union[Config, ResourcePathExpression, None] = None, 

211 *, 

212 butler: Optional[Butler] = None, 

213 collections: Any = None, 

214 run: Optional[str] = None, 

215 searchPaths: Optional[Sequence[ResourcePathExpression]] = None, 

216 writeable: Optional[bool] = None, 

217 inferDefaults: bool = True, 

218 **kwargs: str, 

219 ): 

220 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

221 # Load registry, datastore, etc. from config or existing butler. 

222 if butler is not None: 

223 if config is not None or searchPaths is not None or writeable is not None: 

224 raise TypeError( 

225 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

226 ) 

227 self.registry = butler.registry.copy(defaults) 

228 self.datastore = butler.datastore 

229 self.storageClasses = butler.storageClasses 

230 self._config: ButlerConfig = butler._config 

231 else: 

232 # Can only look for strings in the known repos list. 

233 if isinstance(config, str): 

234 # Somehow ButlerConfig fails in some cases if config is a 

235 # ResourcePath, so force it back to a string here. 

236 config = str(self.get_repo_uri(config, True)) 

237 try: 

238 self._config = ButlerConfig(config, searchPaths=searchPaths) 

239 except FileNotFoundError as e: 

240 if known := self.get_known_repos(): 

241 aliases = f"(known aliases: {', '.join(known)})" 

242 else: 

243 aliases = "(no known aliases)" 

244 raise FileNotFoundError(f"{e} {aliases}") from e 

245 try: 

246 if "root" in self._config: 

247 butlerRoot = self._config["root"] 

248 else: 

249 butlerRoot = self._config.configDir 

250 if writeable is None: 

251 writeable = run is not None 

252 self.registry = Registry.fromConfig( 

253 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

254 ) 

255 self.datastore = Datastore.fromConfig( 

256 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

257 ) 

258 self.storageClasses = StorageClassFactory() 

259 self.storageClasses.addFromConfig(self._config) 

260 except Exception: 

261 # Failures here usually mean that the configuration is incomplete; 

262 # just issue an error message that includes the config file URI. 

263 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

264 raise 

265 

266 # For an execution butler the datastore needs a special 

267 # dependency-inversion trick. This is not used by a regular butler, 

268 # but we do not have a way to distinguish a regular butler from an 

269 # execution butler. 

270 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

271 

272 if "run" in self._config or "collection" in self._config: 

273 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

274 

275 GENERATION: ClassVar[int] = 3 

276 """This is a Generation 3 Butler. 

277 

278 This attribute may be removed in the future, once the Generation 2 Butler 

279 interface has been fully retired; it should only be used in transitional 

280 code. 

281 """ 

282 

283 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

284 """Return DatasetType defined in registry given dataset type name.""" 

285 try: 

286 return self.registry.getDatasetType(name) 

287 except MissingDatasetTypeError: 

288 return None 

289 

290 @classmethod 

291 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

292 """Look up the label in a butler repository index. 

293 

294 Parameters 

295 ---------- 

296 label : `str` 

297 Label of the Butler repository to look up. 

298 return_label : `bool`, optional 

299 If ``label`` cannot be found in the repository index (either 

300 because index is not defined or ``label`` is not in the index) and 

301 ``return_label`` is `True` then return ``ResourcePath(label)``. 

302 If ``return_label`` is `False` (default) then an exception will be 

303 raised instead. 

304 

305 Returns 

306 ------- 

307 uri : `lsst.resources.ResourcePath` 

308 URI to the Butler repository associated with the given label or 

309 default value if it is provided. 

310 

311 Raises 

312 ------ 

313 KeyError 

314 Raised if the label is not found in the index, or if an index 

315 is not defined, and ``return_label`` is `False`. 

316 

317 Notes 

318 ----- 

319 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

320 information is discovered. 

321 """ 

322 return ButlerRepoIndex.get_repo_uri(label, return_label) 

323 
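# A minimal usage sketch for the index lookup above; the "main" label and the
# existence of a repository index are hypothetical assumptions:
#
#     uri = Butler.get_repo_uri("main", return_label=True)
#     # With return_label=False (the default) a missing label raises KeyError.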

324 @classmethod 

325 def get_known_repos(cls) -> Set[str]: 

326 """Retrieve the list of known repository labels. 

327 

328 Returns 

329 ------- 

330 repos : `set` of `str` 

331 All the known labels. Can be empty if no index can be found. 

332 

333 Notes 

334 ----- 

335 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

336 information is discovered. 

337 """ 

338 return ButlerRepoIndex.get_known_repos() 

339 

340 @staticmethod 

341 def makeRepo( 

342 root: ResourcePathExpression, 

343 config: Union[Config, str, None] = None, 

344 dimensionConfig: Union[Config, str, None] = None, 

345 standalone: bool = False, 

346 searchPaths: Optional[List[str]] = None, 

347 forceConfigRoot: bool = True, 

348 outfile: Optional[ResourcePathExpression] = None, 

349 overwrite: bool = False, 

350 ) -> Config: 

351 """Create an empty data repository by adding a butler.yaml config 

352 to a repository root directory. 

353 

354 Parameters 

355 ---------- 

356 root : `lsst.resources.ResourcePathExpression` 

357 Path or URI to the root location of the new repository. Will be 

358 created if it does not exist. 

359 config : `Config` or `str`, optional 

360 Configuration to write to the repository, after setting any 

361 root-dependent Registry or Datastore config options. Can not 

362 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

363 configuration will be used. Root-dependent config options 

364 specified in this config are overwritten if ``forceConfigRoot`` 

365 is `True`. 

366 dimensionConfig : `Config` or `str`, optional 

367 Configuration for dimensions, will be used to initialize registry 

368 database. 

369 standalone : `bool` 

370 If True, write all expanded defaults, not just customized or 

371 repository-specific settings. 

372 This (mostly) decouples the repository from the default 

373 configuration, insulating it from changes to the defaults (which 

374 may be good or bad, depending on the nature of the changes). 

375 Future *additions* to the defaults will still be picked up when 

376 initializing `Butlers` to repos created with ``standalone=True``. 

377 searchPaths : `list` of `str`, optional 

378 Directory paths to search when calculating the full butler 

379 configuration. 

380 forceConfigRoot : `bool`, optional 

381 If `False`, any values present in the supplied ``config`` that 

382 would normally be reset are not overridden and will appear 

383 directly in the output config. This allows non-standard overrides 

384 of the root directory for a datastore or registry to be given. 

385 If this parameter is `True` the values for ``root`` will be 

386 forced into the resulting config if appropriate. 

387 outfile : `lsst.resources.ResourcePathExpression`, optional 

388 If not-`None`, the output configuration will be written to this 

389 location rather than into the repository itself. Can be a URI 

390 string. Can refer to a directory that will be used to write 

391 ``butler.yaml``. 

392 overwrite : `bool`, optional 

393 Create a new configuration file even if one already exists 

394 in the specified output location. Default is to raise 

395 an exception. 

396 

397 Returns 

398 ------- 

399 config : `Config` 

400 The updated `Config` instance written to the repo. 

401 

402 Raises 

403 ------ 

404 ValueError 

405 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

406 regular Config (as these subclasses would make it impossible to 

407 support ``standalone=False``). 

408 FileExistsError 

409 Raised if the output config file already exists. 

410 os.error 

411 Raised if the directory does not exist, exists but is not a 

412 directory, or cannot be created. 

413 

414 Notes 

415 ----- 

416 Note that when ``standalone=False`` (the default), the configuration 

417 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

418 construct the repository should also be used to construct any Butlers 

419 to avoid configuration inconsistencies. 

420 """ 

421 if isinstance(config, (ButlerConfig, ConfigSubset)): 

422 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

423 

424 # Ensure that the root of the repository exists or can be made 

425 root_uri = ResourcePath(root, forceDirectory=True) 

426 root_uri.mkdir() 

427 

428 config = Config(config) 

429 

430 # If we are creating a new repo from scratch with relative roots, 

431 # do not propagate an explicit root from the config file 

432 if "root" in config: 

433 del config["root"] 

434 

435 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

436 imported_class = doImportType(full["datastore", "cls"]) 

437 if not issubclass(imported_class, Datastore): 

438 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

439 datastoreClass: Type[Datastore] = imported_class 

440 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

441 

442 # if key exists in given config, parse it, otherwise parse the defaults 

443 # in the expanded config 

444 if config.get(("registry", "db")): 

445 registryConfig = RegistryConfig(config) 

446 else: 

447 registryConfig = RegistryConfig(full) 

448 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

449 if defaultDatabaseUri is not None: 

450 Config.updateParameters( 

451 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

452 ) 

453 else: 

454 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

455 

456 if standalone: 

457 config.merge(full) 

458 else: 

459 # Always expand the registry.managers section into the per-repo 

460 # config, because after the database schema is created, it's not 

461 # allowed to change anymore. Note that in the standalone=True 

462 # branch, _everything_ in the config is expanded, so there's no 

463 # need to special case this. 

464 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

465 configURI: ResourcePathExpression 

466 if outfile is not None: 

467 # When writing to a separate location we must include 

468 # the root of the butler repo in the config else it won't know 

469 # where to look. 

470 config["root"] = root_uri.geturl() 

471 configURI = outfile 

472 else: 

473 configURI = root_uri 

474 # Strip obscore configuration, if it is present, before writing config 

475 # to a file, obscore config will be stored in registry. 

476 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

477 config_to_write = config.copy() 

478 del config_to_write[obscore_config_key] 

479 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

480 # configFile attribute is updated, need to copy it to original. 

481 config.configFile = config_to_write.configFile 

482 else: 

483 config.dumpToUri(configURI, overwrite=overwrite) 

484 

485 # Create Registry and populate tables 

486 registryConfig = RegistryConfig(config.get("registry")) 

487 dimensionConfig = DimensionConfig(dimensionConfig) 

488 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

489 

490 log.verbose("Wrote new Butler configuration file to %s", configURI) 

491 

492 return config 

493 
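# A brief sketch of typical makeRepo usage as described above; the path, run
# name, and reliance on the default configuration are hypothetical:
#
#     config = Butler.makeRepo("/path/to/new/repo")
#     butler = Butler("/path/to/new/repo", run="u/alice/ingest")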

494 @classmethod 

495 def _unpickle( 

496 cls, 

497 config: ButlerConfig, 

498 collections: Optional[tuple[str, ...]], 

499 run: Optional[str], 

500 defaultDataId: Dict[str, str], 

501 writeable: bool, 

502 ) -> Butler: 

503 """Callable used to unpickle a Butler. 

504 

505 We prefer not to use ``Butler.__init__`` directly so we can force some 

506 of its many arguments to be keyword-only (note that ``__reduce__`` 

507 can only invoke callables with positional arguments). 

508 

509 Parameters 

510 ---------- 

511 config : `ButlerConfig` 

512 Butler configuration, already coerced into a true `ButlerConfig` 

513 instance (and hence after any search paths for overrides have been 

514 utilized). 

515 collections : `tuple` [ `str` ] 

516 Names of the default collections to read from. 

517 run : `str`, optional 

518 Name of the default `~CollectionType.RUN` collection to write to. 

519 defaultDataId : `dict` [ `str`, `str` ] 

520 Default data ID values. 

521 writeable : `bool` 

522 Whether the Butler should support write operations. 

523 

524 Returns 

525 ------- 

526 butler : `Butler` 

527 A new `Butler` instance. 

528 """ 

529 # MyPy doesn't recognize that the kwargs below are totally valid; it 

530 # seems to think ``**defaultDataId`` is a _positional_ argument! 

531 return cls( 

532 config=config, 

533 collections=collections, 

534 run=run, 

535 writeable=writeable, 

536 **defaultDataId, # type: ignore 

537 ) 

538 

539 def __reduce__(self) -> tuple: 

540 """Support pickling.""" 

541 return ( 

542 Butler._unpickle, 

543 ( 

544 self._config, 

545 self.collections, 

546 self.run, 

547 self.registry.defaults.dataId.byName(), 

548 self.registry.isWriteable(), 

549 ), 

550 ) 

551 
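# A sketch of the pickling round trip that __reduce__ enables; the repository
# path and collection name are hypothetical:
#
#     import pickle
#     butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
#     clone = pickle.loads(pickle.dumps(butler))  # rebuilt via Butler._unpickle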

552 def __str__(self) -> str: 

553 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

554 self.collections, self.run, self.datastore, self.registry 

555 ) 

556 

557 def isWriteable(self) -> bool: 

558 """Return `True` if this `Butler` supports write operations.""" 

559 return self.registry.isWriteable() 

560 

561 @contextlib.contextmanager 

562 def transaction(self) -> Iterator[None]: 

563 """Context manager supporting `Butler` transactions. 

564 

565 Transactions can be nested. 

566 """ 

567 with self.registry.transaction(): 

568 with self.datastore.transaction(): 

569 yield 

570 
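# A sketch of grouping writes in the combined registry/datastore transaction
# above; dataset types, data ID values, and objects are hypothetical:
#
#     with butler.transaction():
#         butler.put(catalog, "src", instrument="HSC", visit=903334, detector=16)
#         butler.put(summary, "srcSummary", instrument="HSC", visit=903334, detector=16)
#     # If either put raises, both registry and datastore changes are rolled back.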

571 def _standardizeArgs( 

572 self, 

573 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

574 dataId: Optional[DataId] = None, 

575 for_put: bool = True, 

576 **kwargs: Any, 

577 ) -> Tuple[DatasetType, Optional[DataId]]: 

578 """Standardize the arguments passed to several Butler APIs. 

579 

580 Parameters 

581 ---------- 

582 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

583 When `DatasetRef` the `dataId` should be `None`. 

584 Otherwise the `DatasetType` or name thereof. 

585 dataId : `dict` or `DataCoordinate` 

586 A `dict` of `Dimension` link name, value pairs that label the 

587 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

588 should be provided as the second argument. 

589 for_put : `bool`, optional 

590 If `True` this call is invoked as part of a `Butler.put()`. 

591 Otherwise it is assumed to be part of a `Butler.get()`. This 

592 parameter is only relevant if there is dataset type 

593 inconsistency. 

594 **kwargs 

595 Additional keyword arguments used to augment or construct a 

596 `DataCoordinate`. See `DataCoordinate.standardize` 

597 parameters. 

598 

599 Returns 

600 ------- 

601 datasetType : `DatasetType` 

602 A `DatasetType` instance extracted from ``datasetRefOrType``. 

603 dataId : `dict` or `DataId`, optional 

604 Argument that can be used (along with ``kwargs``) to construct a 

605 `DataId`. 

606 

607 Notes 

608 ----- 

609 Butler APIs that conceptually need a DatasetRef also allow passing a 

610 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

611 keyword arguments that can be used to construct one) separately. This 

612 method accepts those arguments and always returns a true `DatasetType` 

613 and a `DataId` or `dict`. 

614 

615 Standardization of `dict` vs `DataId` is best handled by passing the 

616 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

617 generally similarly flexible. 

618 """ 

619 externalDatasetType: Optional[DatasetType] = None 

620 internalDatasetType: Optional[DatasetType] = None 

621 if isinstance(datasetRefOrType, DatasetRef): 

622 if dataId is not None or kwargs: 

623 raise ValueError("DatasetRef given, cannot use dataId as well") 

624 externalDatasetType = datasetRefOrType.datasetType 

625 dataId = datasetRefOrType.dataId 

626 else: 

627 # Don't check whether DataId is provided, because Registry APIs 

628 # can usually construct a better error message when it wasn't. 

629 if isinstance(datasetRefOrType, DatasetType): 

630 externalDatasetType = datasetRefOrType 

631 else: 

632 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

633 

634 # Check that they are self-consistent 

635 if externalDatasetType is not None: 

636 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

637 if externalDatasetType != internalDatasetType: 

638 # We can allow differences if they are compatible, depending 

639 # on whether this is a get or a put. A get requires that 

640 # the python type associated with the datastore can be 

641 # converted to the user type. A put requires that the user 

642 # supplied python type can be converted to the internal 

643 # type expected by registry. 

644 relevantDatasetType = internalDatasetType 

645 if for_put: 

646 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

647 else: 

648 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

649 relevantDatasetType = externalDatasetType 

650 if not is_compatible: 

651 raise ValueError( 

652 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

653 f"registry definition ({internalDatasetType})" 

654 ) 

655 # Override the internal definition. 

656 internalDatasetType = relevantDatasetType 

657 

658 assert internalDatasetType is not None 

659 return internalDatasetType, dataId 

660 

661 def _rewrite_data_id( 

662 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

663 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

664 """Rewrite a data ID taking into account dimension records. 

665 

666 Take a Data ID and keyword args and rewrite it if necessary to 

667 allow the user to specify dimension records rather than dimension 

668 primary values. 

669 

670 This allows a user to include a dataId dict with keys of 

671 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

672 the integer exposure ID. It also allows a string to be given 

673 for a dimension value rather than the integer ID if that is more 

674 convenient. For example, rather than having to specify the 

675 detector with ``detector.full_name``, a string given for ``detector`` 

676 will be interpreted as the full name and converted to the integer 

677 value. 

678 

679 Keyword arguments can also use strings for dimensions like detector 

680 and exposure but Python does not allow them to include ``.`` and 

681 so the ``exposure.day_obs`` syntax cannot be used in a keyword 

682 argument. 

683 

684 Parameters 

685 ---------- 

686 dataId : `dict` or `DataCoordinate` 

687 A `dict` of `Dimension` link name, value pairs that will label the 

688 `DatasetRef` within a Collection. 

689 datasetType : `DatasetType` 

690 The dataset type associated with this dataId. Required to 

691 determine the relevant dimensions. 

692 **kwargs 

693 Additional keyword arguments used to augment or construct a 

694 `DataId`. See `DataId` parameters. 

695 

696 Returns 

697 ------- 

698 dataId : `dict` or `DataCoordinate` 

699 The possibly rewritten dataId. If given a `DataCoordinate` and 

700 no keyword arguments, the original dataId will be returned 

701 unchanged. 

702 **kwargs : `dict` 

703 Any unused keyword arguments (normally an empty dict). 

704 """ 

705 # Do nothing if we have a standalone DataCoordinate. 

706 if isinstance(dataId, DataCoordinate) and not kwargs: 

707 return dataId, kwargs 

708 

709 # Process dimension records that are using record information 

710 # rather than ids 

711 newDataId: Dict[str, DataIdValue] = {} 

712 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

713 

714 # If all of the dataId comes from keyword parameters we do not need 

715 # to do anything here, because they cannot be of the form 

716 # exposure.obs_id (a "." is not allowed in a keyword parameter). 

717 if dataId: 

718 for k, v in dataId.items(): 

719 # If we have a Dimension we do not need to do anything 

720 # because it cannot be a compound key. 

721 if isinstance(k, str) and "." in k: 

722 # Someone is using a more human-readable dataId 

723 dimensionName, record = k.split(".", 1) 

724 byRecord[dimensionName][record] = v 

725 elif isinstance(k, Dimension): 

726 newDataId[k.name] = v 

727 else: 

728 newDataId[k] = v 

729 

730 # Go through the updated dataId and check the type in case someone is 

731 # using an alternate key. We have already filtered out the compound 

732 # keys in dimension.record format. 

733 not_dimensions = {} 

734 

735 # Will need to look in the dataId and the keyword arguments 

736 # and will remove them if they need to be fixed or are unrecognized. 

737 for dataIdDict in (newDataId, kwargs): 

738 # Use a list so we can adjust the dict safely in the loop 

739 for dimensionName in list(dataIdDict): 

740 value = dataIdDict[dimensionName] 

741 try: 

742 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

743 except KeyError: 

744 # This is not a real dimension 

745 not_dimensions[dimensionName] = value 

746 del dataIdDict[dimensionName] 

747 continue 

748 

749 # Convert an integral type to an explicit int to simplify 

750 # comparisons here 

751 if isinstance(value, numbers.Integral): 

752 value = int(value) 

753 

754 if not isinstance(value, dimension.primaryKey.getPythonType()): 

755 for alternate in dimension.alternateKeys: 

756 if isinstance(value, alternate.getPythonType()): 

757 byRecord[dimensionName][alternate.name] = value 

758 del dataIdDict[dimensionName] 

759 log.debug( 

760 "Converting dimension %s to %s.%s=%s", 

761 dimensionName, 

762 dimensionName, 

763 alternate.name, 

764 value, 

765 ) 

766 break 

767 else: 

768 log.warning( 

769 "Type mismatch found for value '%r' provided for dimension %s. " 

770 "Could not find matching alternative (primary key has type %s) " 

771 "so attempting to use as-is.", 

772 value, 

773 dimensionName, 

774 dimension.primaryKey.getPythonType(), 

775 ) 

776 

777 # By this point kwargs and newDataId should only include valid 

778 # dimensions. Merge kwargs in to the new dataId and log if there 

779 # are dimensions in both (rather than calling update). 

780 for k, v in kwargs.items(): 

781 if k in newDataId and newDataId[k] != v: 

782 log.debug( 

783 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

784 ) 

785 newDataId[k] = v 

786 # No need to retain any values in kwargs now. 

787 kwargs = {} 

788 

789 # If we have some unrecognized dimensions we have to try to connect 

790 # them to records in other dimensions. This is made more complicated 

791 # by some dimensions having records with clashing names. A mitigation 

792 # is that we can tell by this point which dimensions are missing 

793 # for the DatasetType but this does not work for calibrations 

794 # where additional dimensions can be used to constrain the temporal 

795 # axis. 

796 if not_dimensions: 

797 # Search for all dimensions even if we have been given a value 

798 # explicitly. In some cases records are given as well as the 

799 # actual dimension and this should not be an error if they 

800 # match. 

801 mandatoryDimensions = datasetType.dimensions.names # - provided 

802 

803 candidateDimensions: Set[str] = set() 

804 candidateDimensions.update(mandatoryDimensions) 

805 

806 # For calibrations we may well be needing temporal dimensions 

807 # so rather than always including all dimensions in the scan 

808 # restrict things a little. It is still possible for there 

809 # to be confusion over day_obs in visit vs exposure for example. 

810 # If we are not searching calibration collections things may 

811 # fail but they are going to fail anyway because of the 

812 # ambiguousness of the dataId... 

813 if datasetType.isCalibration(): 

814 for dim in self.registry.dimensions.getStaticDimensions(): 

815 if dim.temporal: 

816 candidateDimensions.add(str(dim)) 

817 

818 # Look up table for the first association with a dimension 

819 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

820 

821 # Keep track of whether an item is associated with multiple 

822 # dimensions. 

823 counter: Counter[str] = Counter() 

824 assigned: Dict[str, Set[str]] = defaultdict(set) 

825 

826 # Go through the missing dimensions and associate the 

827 # given names with records within those dimensions 

828 matched_dims = set() 

829 for dimensionName in candidateDimensions: 

830 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

831 fields = dimension.metadata.names | dimension.uniqueKeys.names 

832 for field in not_dimensions: 

833 if field in fields: 

834 guessedAssociation[dimensionName][field] = not_dimensions[field] 

835 counter[dimensionName] += 1 

836 assigned[field].add(dimensionName) 

837 matched_dims.add(field) 

838 

839 # Calculate the fields that matched nothing. 

840 never_found = set(not_dimensions) - matched_dims 

841 

842 if never_found: 

843 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

844 

845 # There is a chance we have allocated a single dataId item 

846 # to multiple dimensions. Need to decide which should be retained. 

847 # For now assume that the most popular alternative wins. 

848 # This means that day_obs with seq_num will result in 

849 # exposure.day_obs and not visit.day_obs 

850 # Also prefer an explicitly missing dimension over an inferred 

851 # temporal dimension. 

852 for fieldName, assignedDimensions in assigned.items(): 

853 if len(assignedDimensions) > 1: 

854 # Pick the most popular (preferring mandatory dimensions) 

855 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

856 if requiredButMissing: 

857 candidateDimensions = requiredButMissing 

858 else: 

859 candidateDimensions = assignedDimensions 

860 

861 # If this is a choice between visit and exposure and 

862 # neither was a required part of the dataset type, 

863 # (hence in this branch) always prefer exposure over 

864 # visit since exposures are always defined and visits 

865 # are defined from exposures. 

866 if candidateDimensions == {"exposure", "visit"}: 

867 candidateDimensions = {"exposure"} 

868 

869 # Select the relevant items and get a new restricted 

870 # counter. 

871 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

872 duplicatesCounter: Counter[str] = Counter() 

873 duplicatesCounter.update(theseCounts) 

874 

875 # Choose the most common. If they are equally common 

876 # we will pick the one that was found first. 

877 # Returns a list of tuples 

878 selected = duplicatesCounter.most_common(1)[0][0] 

879 

880 log.debug( 

881 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

882 " Removed ambiguity by choosing dimension %s.", 

883 fieldName, 

884 ", ".join(assignedDimensions), 

885 selected, 

886 ) 

887 

888 for candidateDimension in assignedDimensions: 

889 if candidateDimension != selected: 

890 del guessedAssociation[candidateDimension][fieldName] 

891 

892 # Update the record look up dict with the new associations 

893 for dimensionName, values in guessedAssociation.items(): 

894 if values: # A dict might now be empty 

895 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

896 byRecord[dimensionName].update(values) 

897 

898 if byRecord: 

899 # Some record specifiers were found so we need to convert 

900 # them to the Id form 

901 for dimensionName, values in byRecord.items(): 

902 if dimensionName in newDataId: 

903 log.debug( 

904 "DataId specified explicit %s dimension value of %s in addition to" 

905 " general record specifiers for it of %s. Ignoring record information.", 

906 dimensionName, 

907 newDataId[dimensionName], 

908 str(values), 

909 ) 

910 # Get the actual record and compare with these values. 

911 try: 

912 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

913 except DataIdError: 

914 raise ValueError( 

915 f"Could not find dimension '{dimensionName}'" 

916 f" with dataId {newDataId} as part of comparing with" 

917 f" record values {byRecord[dimensionName]}" 

918 ) from None 

919 if len(recs) == 1: 

920 errmsg: List[str] = [] 

921 for k, v in values.items(): 

922 if (recval := getattr(recs[0], k)) != v: 

923 errmsg.append(f"{k}({recval} != {v})") 

924 if errmsg: 

925 raise ValueError( 

926 f"Dimension {dimensionName} in dataId has explicit value" 

927 " inconsistent with records: " + ", ".join(errmsg) 

928 ) 

929 else: 

930 # Multiple matches for an explicit dimension 

931 # should never happen but let downstream complain. 

932 pass 

933 continue 

934 

935 # Build up a WHERE expression 

936 bind = {k: v for k, v in values.items()} 

937 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

938 

939 # Hopefully we get a single record that matches 

940 records = set( 

941 self.registry.queryDimensionRecords( 

942 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

943 ) 

944 ) 

945 

946 if len(records) != 1: 

947 if len(records) > 1: 

948 # visit can have an ambiguous answer without involving 

949 # visit_system. The default visit_system is defined 

950 # by the instrument. 

951 if ( 

952 dimensionName == "visit" 

953 and "visit_system_membership" in self.registry.dimensions 

954 and "visit_system" in self.registry.dimensions["instrument"].metadata 

955 ): 

956 instrument_records = list( 

957 self.registry.queryDimensionRecords( 

958 "instrument", 

959 dataId=newDataId, 

960 **kwargs, 

961 ) 

962 ) 

963 if len(instrument_records) == 1: 

964 visit_system = instrument_records[0].visit_system 

965 if visit_system is None: 

966 # Set to a value that will never match. 

967 visit_system = -1 

968 

969 # Look up each visit in the 

970 # visit_system_membership records. 

971 for rec in records: 

972 membership = list( 

973 self.registry.queryDimensionRecords( 

974 # Use bind to allow zero results. 

975 # This is a fully-specified query. 

976 "visit_system_membership", 

977 where="instrument = inst AND visit_system = system AND visit = v", 

978 bind=dict( 

979 inst=instrument_records[0].name, system=visit_system, v=rec.id 

980 ), 

981 ) 

982 ) 

983 if membership: 

984 # This record is the right answer. 

985 records = set([rec]) 

986 break 

987 

988 # The ambiguity may have been resolved so check again. 

989 if len(records) > 1: 

990 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

991 for r in records: 

992 log.debug("- %s", str(r)) 

993 raise ValueError( 

994 f"DataId specification for dimension {dimensionName} is not" 

995 f" uniquely constrained to a single dataset by {values}." 

996 f" Got {len(records)} results." 

997 ) 

998 else: 

999 raise ValueError( 

1000 f"DataId specification for dimension {dimensionName} matched no" 

1001 f" records when constrained by {values}" 

1002 ) 

1003 

1004 # Get the primary key from the real dimension object 

1005 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1006 if not isinstance(dimension, Dimension): 

1007 raise RuntimeError( 

1008 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1009 ) 

1010 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1011 

1012 return newDataId, kwargs 

1013 
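# An illustrative sketch of the record-based data IDs handled above: record
# fields such as day_obs and seq_num can stand in for the integer exposure ID.
# The dataset type and all values are hypothetical:
#
#     raw = butler.get(
#         "raw",
#         {"exposure.day_obs": 20220315, "exposure.seq_num": 123},
#         instrument="LATISS",
#         detector=0,
#     )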

1014 def _findDatasetRef( 

1015 self, 

1016 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1017 dataId: Optional[DataId] = None, 

1018 *, 

1019 collections: Any = None, 

1020 predict: bool = False, 

1021 run: str | None = None, 

1022 **kwargs: Any, 

1023 ) -> DatasetRef: 

1024 """Shared logic for methods that start with a search for a dataset in 

1025 the registry. 

1026 

1027 Parameters 

1028 ---------- 

1029 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1030 When `DatasetRef` the `dataId` should be `None`. 

1031 Otherwise the `DatasetType` or name thereof. 

1032 dataId : `dict` or `DataCoordinate`, optional 

1033 A `dict` of `Dimension` link name, value pairs that label the 

1034 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1035 should be provided as the first argument. 

1036 collections : Any, optional 

1037 Collections to be searched, overriding ``self.collections``. 

1038 Can be any of the types supported by the ``collections`` argument 

1039 to butler construction. 

1040 predict : `bool`, optional 

1041 If `True`, return a newly created `DatasetRef` with a unique 

1042 dataset ID if finding a reference in the `Registry` fails. 

1043 Defaults to `False`. 

1044 run : `str`, optional 

1045 Run collection name to use for creating `DatasetRef` for predicted 

1046 datasets. Only used if ``predict`` is `True`. 

1047 **kwargs 

1048 Additional keyword arguments used to augment or construct a 

1049 `DataId`. See `DataId` parameters. 

1050 

1051 Returns 

1052 ------- 

1053 ref : `DatasetRef` 

1054 A reference to the dataset identified by the given arguments. 

1055 This can be the same dataset reference as given if it was 

1056 resolved. 

1057 

1058 Raises 

1059 ------ 

1060 LookupError 

1061 Raised if no matching dataset exists in the `Registry` (and 

1062 ``predict`` is `False`). 

1063 ValueError 

1064 Raised if a resolved `DatasetRef` was passed as an input, but it 

1065 differs from the one found in the registry. 

1066 TypeError 

1067 Raised if no collections were provided. 

1068 """ 

1069 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1070 if isinstance(datasetRefOrType, DatasetRef): 

1071 if collections is not None: 

1072 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1073 return datasetRefOrType 

1074 timespan: Optional[Timespan] = None 

1075 

1076 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1077 

1078 if datasetType.isCalibration(): 

1079 # Because this is a calibration dataset, first try to 

1080 # standardize the data ID without restricting the dimensions to 

1081 # those of the dataset type requested, because there may be extra 

1082 # dimensions that provide temporal information for a validity-range 

1083 # lookup. 

1084 dataId = DataCoordinate.standardize( 

1085 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1086 ) 

1087 if dataId.graph.temporal: 

1088 dataId = self.registry.expandDataId(dataId) 

1089 timespan = dataId.timespan 

1090 else: 

1091 # Standardize the data ID to just the dimensions of the dataset 

1092 # type instead of letting registry.findDataset do it, so we get the 

1093 # result even if no dataset is found. 

1094 dataId = DataCoordinate.standardize( 

1095 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1096 ) 

1097 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1098 # present in the current collection. 

1099 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1100 if ref is None: 

1101 if predict: 

1102 if run is None: 

1103 run = self.run 

1104 if run is None: 

1105 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1106 return DatasetRef(datasetType, dataId, run=run) 

1107 else: 

1108 if collections is None: 

1109 collections = self.registry.defaults.collections 

1110 raise LookupError( 

1111 f"Dataset {datasetType.name} with data ID {dataId} " 

1112 f"could not be found in collections {collections}." 

1113 ) 

1114 if datasetType != ref.datasetType: 

1115 # If they differ it is because the user explicitly specified 

1116 # a compatible dataset type to this call rather than using the 

1117 # registry definition. The DatasetRef must therefore be recreated 

1118 # using the user definition such that the expected type is 

1119 # returned. 

1120 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1121 

1122 return ref 

1123 

1124 @transactional 

1125 @deprecated( 

1126 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1127 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1128 " were relying on the run parameter to determine the run." 

1129 " Will be removed after v27.0.", 

1130 version="v26.0", 

1131 category=FutureWarning, 

1132 ) 

1133 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1134 # Docstring inherited. 

1135 return self.put(obj, ref) 

1136 

1137 @transactional 

1138 def put( 

1139 self, 

1140 obj: Any, 

1141 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1142 /, 

1143 dataId: Optional[DataId] = None, 

1144 *, 

1145 run: Optional[str] = None, 

1146 **kwargs: Any, 

1147 ) -> DatasetRef: 

1148 """Store and register a dataset. 

1149 

1150 Parameters 

1151 ---------- 

1152 obj : `object` 

1153 The dataset. 

1154 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1155 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1156 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1157 `DatasetRef` is given the run and ID are used directly. 

1158 dataId : `dict` or `DataCoordinate` 

1159 A `dict` of `Dimension` link name, value pairs that label the 

1160 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1161 should be provided as the second argument. 

1162 run : `str`, optional 

1163 The name of the run the dataset should be added to, overriding 

1164 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1165 **kwargs 

1166 Additional keyword arguments used to augment or construct a 

1167 `DataCoordinate`. See `DataCoordinate.standardize` 

1168 parameters. Not used if a resolved `DatasetRef` is provided. 

1169 

1170 Returns 

1171 ------- 

1172 ref : `DatasetRef` 

1173 A reference to the stored dataset, updated with the correct id if 

1174 given. 

1175 

1176 Raises 

1177 ------ 

1178 TypeError 

1179 Raised if the butler is read-only or if no run has been provided. 

1180 """ 

1181 if isinstance(datasetRefOrType, DatasetRef): 

1182 # This is a direct put of predefined DatasetRef. 

1183 log.debug("Butler put direct: %s", datasetRefOrType) 

1184 if run is not None: 

1185 warnings.warn("Run collection is not used for DatasetRef") 

1186 # If registry already has a dataset with the same dataset ID, 

1187 # dataset type and DataId, then _importDatasets will do nothing and 

1188 # just return the original ref. We have to raise in this case; 

1189 # the datastore check below covers that. 

1190 self.registry._importDatasets([datasetRefOrType], expand=True) 

1191 # Before trying to write to the datastore check that it does not 

1192 # know this dataset. This is prone to races, of course. 

1193 if self.datastore.knows(datasetRefOrType): 

1194 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1195 # Try to write dataset to the datastore, if it fails due to a race 

1196 # with another write, the content of stored data may be 

1197 # unpredictable. 

1198 try: 

1199 self.datastore.put(obj, datasetRefOrType) 

1200 except IntegrityError as e: 

1201 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") 

1202 return datasetRefOrType 

1203 

1204 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1205 if not self.isWriteable(): 

1206 raise TypeError("Butler is read-only.") 

1207 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1208 

1209 # Handle dimension records in dataId 

1210 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1211 

1212 # Add Registry Dataset entry. 

1213 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1214 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1215 self.datastore.put(obj, ref) 

1216 

1217 return ref 

1218 
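# A short sketch of the put patterns described above; names and values are
# hypothetical:
#
#     butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
#     ref = butler.put(exposure, "calexp", instrument="HSC", visit=903334, detector=16)
#     # With a fully resolved DatasetRef, the run and dataset ID come from the ref:
#     ref = butler.put(exposure, existing_ref)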

1219 @deprecated( 

1220 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1221 " Please use Butler.get(). Will be removed after v27.0.", 

1222 version="v26.0", 

1223 category=FutureWarning, 

1224 ) 

1225 def getDirect( 

1226 self, 

1227 ref: DatasetRef, 

1228 *, 

1229 parameters: Optional[Dict[str, Any]] = None, 

1230 storageClass: Optional[Union[StorageClass, str]] = None, 

1231 ) -> Any: 

1232 """Retrieve a stored dataset. 

1233 

1234 Parameters 

1235 ---------- 

1236 ref : `DatasetRef` 

1237 Resolved reference to an already stored dataset. 

1238 parameters : `dict` 

1239 Additional StorageClass-defined options to control reading, 

1240 typically used to efficiently read only a subset of the dataset. 

1241 storageClass : `StorageClass` or `str`, optional 

1242 The storage class to be used to override the Python type 

1243 returned by this method. By default the returned type matches 

1244 the dataset type definition for this dataset. Specifying a 

1245 read `StorageClass` can force a different type to be returned. 

1246 This type must be compatible with the original type. 

1247 

1248 Returns 

1249 ------- 

1250 obj : `object` 

1251 The dataset. 

1252 """ 

1253 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1254 

1255 @deprecated( 

1256 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1257 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1258 version="v26.0", 

1259 category=FutureWarning, 

1260 ) 

1261 def getDirectDeferred( 

1262 self, 

1263 ref: DatasetRef, 

1264 *, 

1265 parameters: Union[dict, None] = None, 

1266 storageClass: str | StorageClass | None = None, 

1267 ) -> DeferredDatasetHandle: 

1268 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1269 from a resolved `DatasetRef`. 

1270 

1271 Parameters 

1272 ---------- 

1273 ref : `DatasetRef` 

1274 Resolved reference to an already stored dataset. 

1275 parameters : `dict` 

1276 Additional StorageClass-defined options to control reading, 

1277 typically used to efficiently read only a subset of the dataset. 

1278 storageClass : `StorageClass` or `str`, optional 

1279 The storage class to be used to override the Python type 

1280 returned by this method. By default the returned type matches 

1281 the dataset type definition for this dataset. Specifying a 

1282 read `StorageClass` can force a different type to be returned. 

1283 This type must be compatible with the original type. 

1284 

1285 Returns 

1286 ------- 

1287 obj : `DeferredDatasetHandle` 

1288 A handle which can be used to retrieve a dataset at a later time. 

1289 

1290 Raises 

1291 ------ 

1292 LookupError 

1293 Raised if no matching dataset exists in the `Registry`. 

1294 """ 

1295 # Check that the dataset actually exists. 

1296 if not self.datastore.exists(ref): 

1297 raise LookupError(f"Dataset reference {ref} does not exist.") 

1298 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1299 

1300 def getDeferred( 

1301 self, 

1302 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1303 /, 

1304 dataId: Optional[DataId] = None, 

1305 *, 

1306 parameters: Union[dict, None] = None, 

1307 collections: Any = None, 

1308 storageClass: str | StorageClass | None = None, 

1309 **kwargs: Any, 

1310 ) -> DeferredDatasetHandle: 

1311 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1312 after an immediate registry lookup. 

1313 

1314 Parameters 

1315 ---------- 

1316 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1317 When `DatasetRef` the `dataId` should be `None`. 

1318 Otherwise the `DatasetType` or name thereof. 

1319 dataId : `dict` or `DataCoordinate`, optional 

1320 A `dict` of `Dimension` link name, value pairs that label the 

1321 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1322 should be provided as the first argument. 

1323 parameters : `dict` 

1324 Additional StorageClass-defined options to control reading, 

1325 typically used to efficiently read only a subset of the dataset. 

1326 collections : Any, optional 

1327 Collections to be searched, overriding ``self.collections``. 

1328 Can be any of the types supported by the ``collections`` argument 

1329 to butler construction. 

1330 storageClass : `StorageClass` or `str`, optional 

1331 The storage class to be used to override the Python type 

1332 returned by this method. By default the returned type matches 

1333 the dataset type definition for this dataset. Specifying a 

1334 read `StorageClass` can force a different type to be returned. 

1335 This type must be compatible with the original type. 

1336 **kwargs 

1337 Additional keyword arguments used to augment or construct a 

1338 `DataId`. See `DataId` parameters. 

1339 

1340 Returns 

1341 ------- 

1342 obj : `DeferredDatasetHandle` 

1343 A handle which can be used to retrieve a dataset at a later time. 

1344 

1345 Raises 

1346 ------ 

1347 LookupError 

1348 Raised if no matching dataset exists in the `Registry`. 

1349 ValueError 

1350 Raised if a resolved `DatasetRef` was passed as an input, but it 

1351 differs from the one found in the registry. 

1352 TypeError 

1353 Raised if no collections were provided. 

1354 """ 

1355 if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType): 

1356 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1357 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1358 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1359 
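# A sketch of deferred retrieval as documented above; the dataset type, data ID
# values, and read parameters are hypothetical:
#
#     handle = butler.getDeferred("calexp", instrument="HSC", visit=903334, detector=16)
#     cutout = handle.get(parameters={"bbox": bbox})  # I/O happens only here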

1360 def get( 

1361 self, 

1362 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1363 /, 

1364 dataId: Optional[DataId] = None, 

1365 *, 

1366 parameters: Optional[Dict[str, Any]] = None, 

1367 collections: Any = None, 

1368 storageClass: Optional[Union[StorageClass, str]] = None, 

1369 **kwargs: Any, 

1370 ) -> Any: 

1371 """Retrieve a stored dataset. 

1372 

1373 Parameters 

1374 ---------- 

1375 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1376 When `DatasetRef` the `dataId` should be `None`. 

1377 Otherwise the `DatasetType` or name thereof. 

1378 If a resolved `DatasetRef`, the associated dataset 

1379 is returned directly without additional querying. 

1380 dataId : `dict` or `DataCoordinate` 

1381 A `dict` of `Dimension` link name, value pairs that label the 

1382 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1383 should be provided as the first argument. 

1384 parameters : `dict` 

1385 Additional StorageClass-defined options to control reading, 

1386 typically used to efficiently read only a subset of the dataset. 

1387 collections : Any, optional 

1388 Collections to be searched, overriding ``self.collections``. 

1389 Can be any of the types supported by the ``collections`` argument 

1390 to butler construction. 

1391 storageClass : `StorageClass` or `str`, optional 

1392 The storage class to be used to override the Python type 

1393 returned by this method. By default the returned type matches 

1394 the dataset type definition for this dataset. Specifying a 

1395 read `StorageClass` can force a different type to be returned. 

1396 This type must be compatible with the original type. 

1397 **kwargs 

1398 Additional keyword arguments used to augment or construct a 

1399 `DataCoordinate`. See `DataCoordinate.standardize` 

1400 parameters. 

1401 

1402 Returns 

1403 ------- 

1404 obj : `object` 

1405 The dataset. 

1406 

1407 Raises 

1408 ------ 

1409 LookupError 

1410 Raised if no matching dataset exists in the `Registry`. 

1411 TypeError 

1412 Raised if no collections were provided. 

1413 

1414 Notes 

1415 ----- 

1416 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1417 this method requires that the given data ID include temporal dimensions 

1418 beyond the dimensions of the dataset type itself, in order to find the 

1419 dataset with the appropriate validity range. For example, a "bias" 

1420 dataset with native dimensions ``{instrument, detector}`` could be 

1421 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1422 ``exposure`` is a temporal dimension. 
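
Examples 
-------- 
A minimal sketch of a direct read; the repository path, collection, and 
data ID values are illustrative and assume a repository in which the 
"bias" dataset type is defined with ``{instrument, detector}`` 
dimensions:: 

    from lsst.daf.butler import Butler 

    butler = Butler("my_repo", collections="HSC/defaults") 
    bias = butler.get("bias", instrument="HSC", detector=10, exposure=903334) 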

1423 """ 

1424 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1425 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1426 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1427 

1428 def getURIs( 

1429 self, 

1430 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1431 /, 

1432 dataId: Optional[DataId] = None, 

1433 *, 

1434 predict: bool = False, 

1435 collections: Any = None, 

1436 run: Optional[str] = None, 

1437 **kwargs: Any, 

1438 ) -> DatasetRefURIs: 

1439 """Returns the URIs associated with the dataset. 

1440 

1441 Parameters 

1442 ---------- 

1443 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1444 When `DatasetRef` the `dataId` should be `None`. 

1445 Otherwise the `DatasetType` or name thereof. 

1446 dataId : `dict` or `DataCoordinate` 

1447 A `dict` of `Dimension` link name, value pairs that label the 

1448 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1449 should be provided as the first argument. 

1450 predict : `bool` 

1451 If `True`, allow URIs to be returned of datasets that have not 

1452 been written. 

1453 collections : Any, optional 

1454 Collections to be searched, overriding ``self.collections``. 

1455 Can be any of the types supported by the ``collections`` argument 

1456 to butler construction. 

1457 run : `str`, optional 

1458 Run to use for predictions, overriding ``self.run``. 

1459 **kwargs 

1460 Additional keyword arguments used to augment or construct a 

1461 `DataCoordinate`. See `DataCoordinate.standardize` 

1462 parameters. 

1463 

1464 Returns 

1465 ------- 

1466 uris : `DatasetRefURIs` 

1467 The URI to the primary artifact associated with this dataset (if 

1468 the dataset was disassembled within the datastore this may be 

1469 `None`), and the URIs to any components associated with the dataset 

1470 artifact (which can be empty if there are no components). 
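
Examples 
-------- 
A sketch of retrieving and unpacking the URIs; the dataset type, 
collection, and data ID values are illustrative assumptions:: 

    primary, components = butler.getURIs( 
        "bias", instrument="HSC", detector=10, collections="HSC/calib/bias" 
    ) 
    if primary is not None: 
        print(primary) 
    for component_name, uri in components.items(): 
        print(component_name, uri) 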

1471 """ 

1472 ref = self._findDatasetRef( 

1473 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1474 ) 

1475 return self.datastore.getURIs(ref, predict) 

1476 

1477 def getURI( 

1478 self, 

1479 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1480 /, 

1481 dataId: Optional[DataId] = None, 

1482 *, 

1483 predict: bool = False, 

1484 collections: Any = None, 

1485 run: Optional[str] = None, 

1486 **kwargs: Any, 

1487 ) -> ResourcePath: 

1488 """Return the URI to the Dataset. 

1489 

1490 Parameters 

1491 ---------- 

1492 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1493 When `DatasetRef` the `dataId` should be `None`. 

1494 Otherwise the `DatasetType` or name thereof. 

1495 dataId : `dict` or `DataCoordinate` 

1496 A `dict` of `Dimension` link name, value pairs that label the 

1497 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1498 should be provided as the first argument. 

1499 predict : `bool` 

1500 If `True`, allow URIs to be returned of datasets that have not 

1501 been written. 

1502 collections : Any, optional 

1503 Collections to be searched, overriding ``self.collections``. 

1504 Can be any of the types supported by the ``collections`` argument 

1505 to butler construction. 

1506 run : `str`, optional 

1507 Run to use for predictions, overriding ``self.run``. 

1508 **kwargs 

1509 Additional keyword arguments used to augment or construct a 

1510 `DataCoordinate`. See `DataCoordinate.standardize` 

1511 parameters. 

1512 

1513 Returns 

1514 ------- 

1515 uri : `lsst.resources.ResourcePath` 

1516 URI pointing to the Dataset within the datastore. If the 

1517 Dataset does not exist in the datastore, and if ``predict`` is 

1518 `True`, the URI will be a prediction and will include a URI 

1519 fragment "#predicted". 

1520 If the datastore does not have entities that relate well 

1521 to the concept of a URI, the returned URI string will be 

1522 descriptive. The returned URI is not guaranteed to be obtainable. 

1523 

1524 Raises 

1525 ------ 

1526 LookupError 

1527 Raised if a URI has been requested for a dataset that does not 

1528 exist and guessing is not allowed. 

1529 ValueError 

1530 Raised if a resolved `DatasetRef` was passed as an input, but it 

1531 differs from the one found in the registry. 

1532 TypeError 

1533 Raised if no collections were provided. 

1534 RuntimeError 

1535 Raised if a URI is requested for a dataset that consists of 

1536 multiple artifacts. 
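
Examples 
-------- 
A sketch of predicting a URI before the dataset has been written; the 
dataset type, run, and data ID values are illustrative assumptions:: 

    uri = butler.getURI( 
        "bias", instrument="HSC", detector=10, predict=True, run="HSC/calib/bias" 
    ) 
    # A predicted location carries a "#predicted" URI fragment. 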

1537 """ 

1538 primary, components = self.getURIs( 

1539 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1540 ) 

1541 

1542 if primary is None or components: 

1543 raise RuntimeError( 

1544 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1545 "Use Butler.getURIs() instead." 

1546 ) 

1547 return primary 

1548 

1549 def retrieveArtifacts( 

1550 self, 

1551 refs: Iterable[DatasetRef], 

1552 destination: ResourcePathExpression, 

1553 transfer: str = "auto", 

1554 preserve_path: bool = True, 

1555 overwrite: bool = False, 

1556 ) -> List[ResourcePath]: 

1557 """Retrieve the artifacts associated with the supplied refs. 

1558 

1559 Parameters 

1560 ---------- 

1561 refs : iterable of `DatasetRef` 

1562 The datasets for which artifacts are to be retrieved. 

1563 A single ref can result in multiple artifacts. The refs must 

1564 be resolved. 

1565 destination : `lsst.resources.ResourcePath` or `str` 

1566 Location to write the artifacts. 

1567 transfer : `str`, optional 

1568 Method to use to transfer the artifacts. Must be one of the options 

1569 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1570 "move" is not allowed. 

1571 preserve_path : `bool`, optional 

1572 If `True` the full path of the artifact within the datastore 

1573 is preserved. If `False` the final file component of the path 

1574 is used. 

1575 overwrite : `bool`, optional 

1576 If `True` allow transfers to overwrite existing files at the 

1577 destination. 

1578 

1579 Returns 

1580 ------- 

1581 targets : `list` of `lsst.resources.ResourcePath` 

1582 URIs of file artifacts in destination location. Order is not 

1583 preserved. 

1584 

1585 Notes 

1586 ----- 

1587 For non-file datastores the artifacts written to the destination 

1588 may not match the representation inside the datastore. For example 

1589 a hierarchical data structure in a NoSQL database may well be stored 

1590 as a JSON file. 
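
Examples 
-------- 
A sketch of copying the file artifacts for a query result to a local 
directory; the dataset type, collection, and destination are 
illustrative assumptions:: 

    refs = butler.registry.queryDatasets("flat", collections="HSC/calib") 
    paths = butler.retrieveArtifacts( 
        refs, destination="/tmp/flats", transfer="copy", preserve_path=True 
    ) 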

1591 """ 

1592 return self.datastore.retrieveArtifacts( 

1593 refs, 

1594 ResourcePath(destination), 

1595 transfer=transfer, 

1596 preserve_path=preserve_path, 

1597 overwrite=overwrite, 

1598 ) 

1599 

1600 def datasetExists( 

1601 self, 

1602 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1603 dataId: Optional[DataId] = None, 

1604 *, 

1605 collections: Any = None, 

1606 **kwargs: Any, 

1607 ) -> bool: 

1608 """Return True if the Dataset is actually present in the Datastore. 

1609 

1610 Parameters 

1611 ---------- 

1612 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1613 When `DatasetRef` the `dataId` should be `None`. 

1614 Otherwise the `DatasetType` or name thereof. 

1615 dataId : `dict` or `DataCoordinate` 

1616 A `dict` of `Dimension` link name, value pairs that label the 

1617 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1618 should be provided as the first argument. 

1619 collections : Any, optional 

1620 Collections to be searched, overriding ``self.collections``. 

1621 Can be any of the types supported by the ``collections`` argument 

1622 to butler construction. 

1623 **kwargs 

1624 Additional keyword arguments used to augment or construct a 

1625 `DataCoordinate`. See `DataCoordinate.standardize` 

1626 parameters. 

1627 

1628 Raises 

1629 ------ 

1630 LookupError 

1631 Raised if the dataset is not even present in the Registry. 

1632 ValueError 

1633 Raised if a resolved `DatasetRef` was passed as an input, but it 

1634 differs from the one found in the registry. 

1635 TypeError 

1636 Raised if no collections were provided. 
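
Examples 
-------- 
A sketch of checking for a stored dataset before reading it; the 
dataset type, collection, and data ID values are illustrative 
assumptions:: 

    if butler.datasetExists("bias", instrument="HSC", detector=10, 
                            collections="HSC/calib/bias"): 
        bias = butler.get("bias", instrument="HSC", detector=10, 
                          collections="HSC/calib/bias") 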

1637 """ 

1638 # A resolved ref may be given that is not known to this butler. 

1639 if isinstance(datasetRefOrType, DatasetRef): 

1640 ref = self.registry.getDataset(datasetRefOrType.id) 

1641 if ref is None: 

1642 raise LookupError( 

1643 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1644 ) 

1645 else: 

1646 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1647 return self.datastore.exists(ref) 

1648 

1649 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1650 """Remove one or more `~CollectionType.RUN` collections and the 

1651 datasets within them. 

1652 

1653 Parameters 

1654 ---------- 

1655 names : `Iterable` [ `str` ] 

1656 The names of the collections to remove. 

1657 unstore : `bool`, optional 

1658 If `True` (default), delete datasets from all datastores in which 

1659 they are present, and attempt to roll back the registry deletions if 

1660 datastore deletions fail (which may not always be possible). If 

1661 `False`, datastore records for these datasets are still removed, 

1662 but any artifacts (e.g. files) will not be. 

1663 

1664 Raises 

1665 ------ 

1666 TypeError 

1667 Raised if one or more collections are not of type 

1668 `~CollectionType.RUN`. 
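
Examples 
-------- 
A sketch of deleting a scratch run and its file artifacts; the run name 
is an illustrative assumption and must refer to a 
`~CollectionType.RUN` collection:: 

    butler.removeRuns(["u/someone/scratch"], unstore=True) 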

1669 """ 

1670 if not self.isWriteable(): 

1671 raise TypeError("Butler is read-only.") 

1672 names = list(names) 

1673 refs: List[DatasetRef] = [] 

1674 for name in names: 

1675 collectionType = self.registry.getCollectionType(name) 

1676 if collectionType is not CollectionType.RUN: 

1677 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1678 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1679 with self.datastore.transaction(): 

1680 with self.registry.transaction(): 

1681 if unstore: 

1682 self.datastore.trash(refs) 

1683 else: 

1684 self.datastore.forget(refs) 

1685 for name in names: 

1686 self.registry.removeCollection(name) 

1687 if unstore: 

1688 # Point of no return for removing artifacts 

1689 self.datastore.emptyTrash() 

1690 

1691 def pruneDatasets( 

1692 self, 

1693 refs: Iterable[DatasetRef], 

1694 *, 

1695 disassociate: bool = True, 

1696 unstore: bool = False, 

1697 tags: Iterable[str] = (), 

1698 purge: bool = False, 

1699 ) -> None: 

1700 # docstring inherited from LimitedButler 

1701 

1702 if not self.isWriteable(): 

1703 raise TypeError("Butler is read-only.") 

1704 if purge: 

1705 if not disassociate: 

1706 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1707 if not unstore: 

1708 raise TypeError("Cannot pass purge=True without unstore=True.") 

1709 elif disassociate: 

1710 tags = tuple(tags) 

1711 if not tags: 

1712 raise TypeError("No tags provided but disassociate=True.") 

1713 for tag in tags: 

1714 collectionType = self.registry.getCollectionType(tag) 

1715 if collectionType is not CollectionType.TAGGED: 

1716 raise TypeError( 

1717 f"Cannot disassociate from collection '{tag}' " 

1718 f"of non-TAGGED type {collectionType.name}." 

1719 ) 

1720 # Transform possibly-single-pass iterable into something we can iterate 

1721 # over multiple times. 

1722 refs = list(refs) 

1723 # Pruning a component of a DatasetRef makes no sense since registry 

1724 # doesn't know about components and datastore might not store 

1725 # components in a separate file 

1726 for ref in refs: 

1727 if ref.datasetType.component(): 

1728 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1729 # We don't need an unreliable Datastore transaction for this, because 

1730 # we've been extra careful to ensure that Datastore.trash only involves 

1731 # mutating the Registry (it can _look_ at Datastore-specific things, 

1732 # but shouldn't change them), and hence all operations here are 

1733 # Registry operations. 

1734 with self.datastore.transaction(): 

1735 with self.registry.transaction(): 

1736 if unstore: 

1737 self.datastore.trash(refs) 

1738 if purge: 

1739 self.registry.removeDatasets(refs) 

1740 elif disassociate: 

1741 assert tags, "Guaranteed by earlier logic in this function." 

1742 for tag in tags: 

1743 self.registry.disassociate(tag, refs) 

1744 # We've exited the Registry transaction, and apparently committed. 

1745 # (if there was an exception, everything rolled back, and it's as if 

1746 # nothing happened - and we never get here). 

1747 # Datastore artifacts are not yet gone, but they're clearly marked 

1748 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1749 # problems we can try again later, and if manual administrative 

1750 # intervention is required, it's pretty clear what that should entail: 

1751 # deleting everything on disk and in private Datastore tables that is 

1752 # in the dataset_location_trash table. 

1753 if unstore: 

1754 # Point of no return for removing artifacts 

1755 self.datastore.emptyTrash() 

1756 

1757 @transactional 

1758 def ingest( 

1759 self, 

1760 *datasets: FileDataset, 

1761 transfer: Optional[str] = "auto", 

1762 run: Optional[str] = None, 

1763 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1764 record_validation_info: bool = True, 

1765 ) -> None: 

1766 """Store and register one or more datasets that already exist on disk. 

1767 

1768 Parameters 

1769 ---------- 

1770 datasets : `FileDataset` 

1771 Each positional argument is a struct containing information about 

1772 a file to be ingested, including its URI (either absolute or 

1773 relative to the datastore root, if applicable), a resolved 

1774 `DatasetRef`, and optionally a formatter class or its 

1775 fully-qualified string name. If a formatter is not provided, the 

1776 formatter that would be used for `put` is assumed. On successful 

1777 ingest all `FileDataset.formatter` attributes will be set to the 

1778 formatter class used. `FileDataset.path` attributes may be modified 

1779 to put paths in whatever the datastore considers a standardized 

1780 form. 

1781 transfer : `str`, optional 

1782 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1783 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1784 transfer the file. 

1785 run : `str`, optional 

1786 The name of the run ingested datasets should be added to, 

1787 overriding ``self.run``. This parameter is now deprecated since 

1788 the run is encoded in the ``FileDataset``. 

1789 idGenerationMode : `DatasetIdGenEnum`, optional 

1790 Specifies option for generating dataset IDs. By default unique IDs 

1791 are generated for each inserted dataset. 

1792 record_validation_info : `bool`, optional 

1793 If `True`, the default, the datastore can record validation 

1794 information associated with the file. If `False` the datastore 

1795 will not attempt to track any information such as checksums 

1796 or file sizes. This can be useful if such information is tracked 

1797 in an external system or if the file is to be compressed in place. 

1798 It is up to the datastore whether this parameter is relevant. 

1799 

1800 Raises 

1801 ------ 

1802 TypeError 

1803 Raised if the butler is read-only or if no run was provided. 

1804 NotImplementedError 

1805 Raised if the `Datastore` does not support the given transfer mode. 

1806 DatasetTypeNotSupportedError 

1807 Raised if one or more files to be ingested have a dataset type that 

1808 is not supported by the `Datastore`. 

1809 FileNotFoundError 

1810 Raised if one of the given files does not exist. 

1811 FileExistsError 

1812 Raised if transfer is not `None` but the (internal) location the 

1813 file would be moved to is already occupied. 

1814 

1815 Notes 

1816 ----- 

1817 This operation is not fully exception safe: if a database operation 

1818 fails, the given `FileDataset` instances may be only partially updated. 

1819 

1820 It is atomic in terms of database operations (they will either all 

1821 succeed or all fail) provided the database engine implements 

1822 transactions correctly. It will attempt to be atomic in terms of 

1823 filesystem operations as well, but this cannot be implemented 

1824 rigorously for most datastores. 
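
Examples 
-------- 
A sketch of ingesting a single file that already exists on disk; the 
dataset type, run, data ID values, and file path are illustrative 
assumptions, and the dataset type is assumed to be registered already:: 

    from lsst.daf.butler import DatasetRef, FileDataset 

    ref = DatasetRef( 
        butler.registry.getDatasetType("raw"), 
        {"instrument": "HSC", "detector": 10, "exposure": 903334}, 
        run="HSC/raw/all", 
    ) 
    butler.ingest( 
        FileDataset(path="file:///data/HSC-903334-010.fits", refs=[ref]), 
        transfer="copy", 
    ) 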

1825 """ 

1826 if not self.isWriteable(): 

1827 raise TypeError("Butler is read-only.") 

1828 

1829 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1830 if not datasets: 

1831 return 

1832 

1833 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1834 

1835 # We need to reorganize all the inputs so that they are grouped 

1836 # by dataset type and run. Multiple refs in a single FileDataset 

1837 # are required to share the run and dataset type. 

1838 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1839 groupedData: GroupedData = defaultdict(list) 

1840 

1841 # Track DataIDs that are being ingested so we can spot issues early 

1842 # with duplication. Retain previous FileDataset so we can report it. 

1843 groupedDataIds: MutableMapping[ 

1844 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1845 ] = defaultdict(dict) 

1846 

1847 used_run = False 

1848 

1849 # And the nested loop that populates it: 

1850 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1851 # Somewhere to store pre-existing refs if we have an 

1852 # execution butler. 

1853 existingRefs: List[DatasetRef] = [] 

1854 

1855 for ref in dataset.refs: 

1856 assert ref.run is not None # For mypy 

1857 group_key = (ref.datasetType, ref.run) 

1858 

1859 if ref.dataId in groupedDataIds[group_key]: 

1860 raise ConflictingDefinitionError( 

1861 f"Ingest conflict. Dataset {dataset.path} has same" 

1862 " DataId as other ingest dataset" 

1863 f" {groupedDataIds[group_key][ref.dataId].path} " 

1864 f" ({ref.dataId})" 

1865 ) 

1866 

1867 groupedDataIds[group_key][ref.dataId] = dataset 

1868 

1869 if existingRefs: 

1870 if len(dataset.refs) != len(existingRefs): 

1871 # Keeping track of partially pre-existing datasets is hard 

1872 # and should generally never happen. For now don't allow 

1873 # it. 

1874 raise ConflictingDefinitionError( 

1875 f"For dataset {dataset.path} some dataIds already exist" 

1876 " in registry but others do not. This is not supported." 

1877 ) 

1878 

1879 # Store expanded form in the original FileDataset. 

1880 dataset.refs = existingRefs 

1881 else: 

1882 groupedData[group_key].append(dataset) 

1883 

1884 if not used_run and run is not None: 

1885 warnings.warn( 

1886 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

1887 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

1888 category=FutureWarning, 

1889 stacklevel=3, # Take into account the @transactional decorator. 

1890 ) 

1891 

1892 # Now we can bulk-insert into Registry for each DatasetType. 

1893 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1894 groupedData.items(), desc="Bulk-inserting datasets by type" 

1895 ): 

1896 refs_to_import = [] 

1897 for dataset in grouped_datasets: 

1898 refs_to_import.extend(dataset.refs) 

1899 

1900 n_refs = len(refs_to_import) 

1901 log.verbose( 

1902 "Importing %d ref%s of dataset type %r into run %r", 

1903 n_refs, 

1904 "" if n_refs == 1 else "s", 

1905 datasetType.name, 

1906 this_run, 

1907 ) 

1908 

1909 # Import the refs and expand the DataCoordinates since we can't 

1910 # guarantee that they are expanded and Datastore will need 

1911 # the records. 

1912 imported_refs = self.registry._importDatasets(refs_to_import, expand=True) 

1913 assert set(imported_refs) == set(refs_to_import) 

1914 

1915 # Replace all the refs in the FileDataset with expanded versions. 

1916 # Pull them off in the order we put them on the list. 

1917 for dataset in grouped_datasets: 

1918 n_dataset_refs = len(dataset.refs) 

1919 dataset.refs = imported_refs[:n_dataset_refs] 

1920 del imported_refs[:n_dataset_refs] 

1921 

1922 # Bulk-insert everything into Datastore. 

1923 # We do not know if any of the registry entries already existed 

1924 # (_importDatasets only complains if they exist but differ) so 

1925 # we have to catch IntegrityError explicitly. 

1926 try: 

1927 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1928 except IntegrityError as e: 

1929 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") 

1930 

1931 @contextlib.contextmanager 

1932 def export( 

1933 self, 

1934 *, 

1935 directory: Optional[str] = None, 

1936 filename: Optional[str] = None, 

1937 format: Optional[str] = None, 

1938 transfer: Optional[str] = None, 

1939 ) -> Iterator[RepoExportContext]: 

1940 """Export datasets from the repository represented by this `Butler`. 

1941 

1942 This method is a context manager that returns a helper object 

1943 (`RepoExportContext`) that is used to indicate what information from 

1944 the repository should be exported. 

1945 

1946 Parameters 

1947 ---------- 

1948 directory : `str`, optional 

1949 Directory dataset files should be written to if ``transfer`` is not 

1950 `None`. 

1951 filename : `str`, optional 

1952 Name for the file that will include database information associated 

1953 with the exported datasets. If this is not an absolute path and 

1954 ``directory`` is not `None`, it will be written to ``directory`` 

1955 instead of the current working directory. Defaults to 

1956 "export.{format}". 

1957 format : `str`, optional 

1958 File format for the database information file. If `None`, the 

1959 extension of ``filename`` will be used. 

1960 transfer : `str`, optional 

1961 Transfer mode passed to `Datastore.export`. 

1962 

1963 Raises 

1964 ------ 

1965 TypeError 

1966 Raised if the set of arguments passed is inconsistent. 

1967 

1968 Examples 

1969 -------- 

1970 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1971 methods are used to provide the iterables over data IDs and/or datasets 

1972 to be exported:: 

1973 

1974 with butler.export(filename="exports.yaml") as export: 

1975 # Export all flats, but none of the dimension element rows 

1976 # (i.e. data ID information) associated with them. 

1977 export.saveDatasets(butler.registry.queryDatasets("flat"), 

1978 elements=()) 

1979 # Export all datasets that start with "deepCoadd_" and all of 

1980 # their associated data ID information. 

1981 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

1982 """ 

1983 if directory is None and transfer is not None: 

1984 raise TypeError("Cannot transfer without providing a directory.") 

1985 if transfer == "move": 

1986 raise TypeError("Transfer may not be 'move': export is read-only") 

1987 if format is None: 

1988 if filename is None: 

1989 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1990 else: 

1991 _, format = os.path.splitext(filename) 

1992 if not format: 

1993 raise ValueError("Please specify a file extension to determine export format.") 

1994 format = format[1:] # Strip leading "." 

1995 elif filename is None: 

1996 filename = f"export.{format}" 

1997 if directory is not None: 

1998 filename = os.path.join(directory, filename) 

1999 formats = self._config["repo_transfer_formats"] 

2000 if format not in formats: 

2001 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2002 BackendClass = get_class_of(formats[format, "export"]) 

2003 with open(filename, "w") as stream: 

2004 backend = BackendClass(stream, universe=self.registry.dimensions) 

2005 try: 

2006 helper = RepoExportContext( 

2007 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2008 ) 

2009 yield helper 

2010 except BaseException: 

2011 raise 

2012 else: 

2013 helper._finish() 

2014 

2015 def import_( 

2016 self, 

2017 *, 

2018 directory: Optional[ResourcePathExpression] = None, 

2019 filename: Union[ResourcePathExpression, TextIO, None] = None, 

2020 format: Optional[str] = None, 

2021 transfer: Optional[str] = None, 

2022 skip_dimensions: Optional[Set] = None, 

2023 ) -> None: 

2024 """Import datasets into this repository that were exported from a 

2025 different butler repository via `~lsst.daf.butler.Butler.export`. 

2026 

2027 Parameters 

2028 ---------- 

2029 directory : `~lsst.resources.ResourcePathExpression`, optional 

2030 Directory containing dataset files to import from. If `None`, 

2031 ``filename`` and all dataset file paths specified therein must 

2032 be absolute. 

2033 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2034 A stream or name of file that contains database information 

2035 associated with the exported datasets, typically generated by 

2036 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2037 `~lsst.resources.ResourcePath` and is not an absolute path, 

2038 it will first be looked for relative to ``directory`` and if not 

2039 found there it will be looked for in the current working 

2040 directory. Defaults to "export.{format}". 

2041 format : `str`, optional 

2042 File format for ``filename``. If `None`, the extension of 

2043 ``filename`` will be used. 

2044 transfer : `str`, optional 

2045 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2046 skip_dimensions : `set`, optional 

2047 Names of dimensions that should be skipped and not imported. 

2048 

2049 Raises 

2050 ------ 

2051 TypeError 

2052 Raised if the set of arguments passed is inconsistent, or if the 

2053 butler is read-only. 
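
Examples 
-------- 
A sketch of importing a previously exported repository subset; the 
directory and file names are illustrative assumptions:: 

    butler.import_( 
        directory="/path/to/exported", filename="export.yaml", transfer="symlink" 
    ) 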

2054 """ 

2055 if not self.isWriteable(): 

2056 raise TypeError("Butler is read-only.") 

2057 if format is None: 

2058 if filename is None: 

2059 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2060 else: 

2061 _, format = os.path.splitext(filename) # type: ignore 

2062 elif filename is None: 

2063 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2064 if directory is not None: 

2065 directory = ResourcePath(directory, forceDirectory=True) 

2066 # mypy doesn't think this will work but it does in python >= 3.10. 

2067 if isinstance(filename, ResourcePathExpression): # type: ignore 

2068 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2069 if not filename.isabs() and directory is not None: 

2070 potential = directory.join(filename) 

2071 exists_in_cwd = filename.exists() 

2072 exists_in_dir = potential.exists() 

2073 if exists_in_cwd and exists_in_dir: 

2074 log.warning( 

2075 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2076 "Additionally, the file exists relative to the given search directory (%s). " 

2077 "Using the export file in the given directory.", 

2078 filename, 

2079 potential, 

2080 ) 

2081 # Given they specified an explicit directory and that 

2082 # directory has the export file in it, assume that that 

2083 # is what was meant despite the file in cwd. 

2084 filename = potential 

2085 elif exists_in_dir: 

2086 filename = potential 

2087 elif not exists_in_cwd and not exists_in_dir: 

2088 # Raise early. 

2089 raise FileNotFoundError( 

2090 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2091 ) 

2092 BackendClass: type[RepoImportBackend] = get_class_of( 

2093 self._config["repo_transfer_formats"][format]["import"] 

2094 ) 

2095 

2096 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2097 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg] 

2098 backend.register() 

2099 with self.transaction(): 

2100 backend.load( 

2101 self.datastore, 

2102 directory=directory, 

2103 transfer=transfer, 

2104 skip_dimensions=skip_dimensions, 

2105 ) 

2106 

2107 if isinstance(filename, ResourcePath): 

2108 # We can not use open() here at the moment because of 

2109 # DM-38589 since yaml does stream.read(8192) in a loop. 

2110 stream = io.StringIO(filename.read().decode()) 

2111 doImport(stream) 

2112 else: 

2113 doImport(filename) # type: ignore 

2114 

2115 def transfer_from( 

2116 self, 

2117 source_butler: LimitedButler, 

2118 source_refs: Iterable[DatasetRef], 

2119 transfer: str = "auto", 

2120 skip_missing: bool = True, 

2121 register_dataset_types: bool = False, 

2122 transfer_dimensions: bool = False, 

2123 ) -> collections.abc.Collection[DatasetRef]: 

2124 """Transfer datasets to this Butler from a run in another Butler. 

2125 

2126 Parameters 

2127 ---------- 

2128 source_butler : `LimitedButler` 

2129 Butler from which the datasets are to be transferred. If data IDs 

2130 in ``source_refs`` are not expanded then this has to be a full 

2131 `Butler` whose registry will be used to expand data IDs. 

2132 source_refs : iterable of `DatasetRef` 

2133 Datasets defined in the source butler that should be transferred to 

2134 this butler. 

2135 transfer : `str`, optional 

2136 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2137 skip_missing : `bool` 

2138 If `True`, datasets with no datastore artifact associated with 

2139 them are not transferred. If `False` a registry entry will be 

2140 created even if no datastore record is created (and so will 

2141 look equivalent to the dataset being unstored). 

2142 register_dataset_types : `bool` 

2143 If `True` any missing dataset types are registered. Otherwise 

2144 an exception is raised. 

2145 transfer_dimensions : `bool`, optional 

2146 If `True`, dimension record data associated with the new datasets 

2147 will be transferred. 

2148 

2149 Returns 

2150 ------- 

2151 refs : `list` of `DatasetRef` 

2152 The refs added to this Butler. 

2153 

2154 Notes 

2155 ----- 

2156 The datastore artifact has to exist for a transfer 

2157 to be made but non-existence is not an error. 

2158 

2159 Datasets that already exist in this run will be skipped. 

2160 

2161 The datasets are imported as part of a transaction, although 

2162 dataset types are registered before the transaction is started. 

2163 This means that it is possible for a dataset type to be registered 

2164 even though transfer has failed. 
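
Examples 
-------- 
A sketch of transferring all "flat" datasets from one repository to 
another; the repository paths and collection name are illustrative 
assumptions:: 

    source = Butler("/repo/source") 
    dest = Butler("/repo/dest", writeable=True) 
    refs = source.registry.queryDatasets("flat", collections="HSC/calib") 
    transferred = dest.transfer_from( 
        source, refs, transfer="copy", register_dataset_types=True 
    ) 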

2165 """ 

2166 if not self.isWriteable(): 

2167 raise TypeError("Butler is read-only.") 

2168 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2169 

2170 # Will iterate through the refs multiple times so need to convert 

2171 # to a list if this isn't a collection. 

2172 if not isinstance(source_refs, collections.abc.Collection): 

2173 source_refs = list(source_refs) 

2174 

2175 original_count = len(source_refs) 

2176 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2177 

2178 # In some situations the datastore artifact may be missing 

2179 # and we do not want that registry entry to be imported. 

2180 # Asking datastore is not sufficient, the records may have been 

2181 # purged, we have to ask for the (predicted) URI and check 

2182 # existence explicitly. Execution butler is set up exactly like 

2183 # this with no datastore records. 

2184 artifact_existence: Dict[ResourcePath, bool] = {} 

2185 if skip_missing: 

2186 dataset_existence = source_butler.datastore.mexists( 

2187 source_refs, artifact_existence=artifact_existence 

2188 ) 

2189 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2190 filtered_count = len(source_refs) 

2191 n_missing = original_count - filtered_count 

2192 log.verbose( 

2193 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2194 n_missing, 

2195 "" if n_missing == 1 else "s", 

2196 filtered_count, 

2197 ) 

2198 

2199 # Importing requires that we group the refs by dataset type and run 

2200 # before doing the import. 

2201 source_dataset_types = set() 

2202 grouped_refs = defaultdict(list) 

2203 for ref in source_refs: 

2204 grouped_refs[ref.datasetType, ref.run].append(ref) 

2205 source_dataset_types.add(ref.datasetType) 

2206 

2207 # Check to see if the dataset type in the source butler has 

2208 # the same definition in the target butler and register missing 

2209 # ones if requested. Registration must happen outside a transaction. 

2210 newly_registered_dataset_types = set() 

2211 for datasetType in source_dataset_types: 

2212 if register_dataset_types: 

2213 # Let this raise immediately if inconsistent. Continuing 

2214 # on to find additional inconsistent dataset types 

2215 # might result in additional unwanted dataset types being 

2216 # registered. 

2217 if self.registry.registerDatasetType(datasetType): 

2218 newly_registered_dataset_types.add(datasetType) 

2219 else: 

2220 # If the dataset type is missing, let it fail immediately. 

2221 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2222 if target_dataset_type != datasetType: 

2223 raise ConflictingDefinitionError( 

2224 "Source butler dataset type differs from definition" 

2225 f" in target butler: {datasetType} !=" 

2226 f" {target_dataset_type}" 

2227 ) 

2228 if newly_registered_dataset_types: 

2229 # We may have registered some even if there were inconsistencies 

2230 # but should let people know (or else remove them again). 

2231 log.log( 

2232 VERBOSE, 

2233 "Registered the following dataset types in the target Butler: %s", 

2234 ", ".join(d.name for d in newly_registered_dataset_types), 

2235 ) 

2236 else: 

2237 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2238 

2239 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2240 if transfer_dimensions: 

2241 # Collect all the dimension records for these refs. 

2242 # All dimensions are to be copied but the list of valid dimensions 

2243 # comes from this butler's universe. 

2244 elements = frozenset( 

2245 element 

2246 for element in self.registry.dimensions.getStaticElements() 

2247 if element.hasTable() and element.viewOf is None 

2248 ) 

2249 dataIds = set(ref.dataId for ref in source_refs) 

2250 # This logic comes from saveDataIds. 

2251 for dataId in dataIds: 

2252 # Need an expanded record, if not expanded that we need a full 

2253 # butler with registry (allow mocks with registry too). 

2254 if not dataId.hasRecords(): 

2255 if registry := getattr(source_butler, "registry", None): 

2256 dataId = registry.expandDataId(dataId) 

2257 else: 

2258 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2259 # If this butler doesn't know about a dimension in the source 

2260 # butler things will break later. 

2261 for record in dataId.records.values(): 

2262 if record is not None and record.definition in elements: 

2263 dimension_records[record.definition].setdefault(record.dataId, record) 

2264 

2265 handled_collections: Set[str] = set() 

2266 

2267 # Do all the importing in a single transaction. 

2268 with self.transaction(): 

2269 if dimension_records: 

2270 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2271 for element, r in dimension_records.items(): 

2272 records = [r[dataId] for dataId in r] 

2273 # Assume that if the record is already present that we can 

2274 # use it without having to check that the record metadata 

2275 # is consistent. 

2276 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2277 

2278 n_imported = 0 

2279 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2280 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2281 ): 

2282 if run not in handled_collections: 

2283 # May need to create output collection. If source butler 

2284 # has a registry, ask for documentation string. 

2285 run_doc = None 

2286 if registry := getattr(source_butler, "registry", None): 

2287 run_doc = registry.getCollectionDocumentation(run) 

2288 registered = self.registry.registerRun(run, doc=run_doc) 

2289 handled_collections.add(run) 

2290 if registered: 

2291 log.log(VERBOSE, "Creating output run %s", run) 

2292 

2293 n_refs = len(refs_to_import) 

2294 log.verbose( 

2295 "Importing %d ref%s of dataset type %s into run %s", 

2296 n_refs, 

2297 "" if n_refs == 1 else "s", 

2298 datasetType.name, 

2299 run, 

2300 ) 

2301 

2302 # Assume we are using UUIDs and the source refs will match 

2303 # those imported. 

2304 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2305 assert set(imported_refs) == set(refs_to_import) 

2306 n_imported += len(imported_refs) 

2307 

2308 assert len(source_refs) == n_imported 

2309 log.verbose("Imported %d datasets into destination butler", n_imported) 

2310 

2311 # Ask the datastore to transfer. The datastore has to check that 

2312 # the source datastore is compatible with the target datastore. 

2313 accepted, rejected = self.datastore.transfer_from( 

2314 source_butler.datastore, 

2315 source_refs, 

2316 transfer=transfer, 

2317 artifact_existence=artifact_existence, 

2318 ) 

2319 if rejected: 

2320 # For now, accept the registry entries but not the files. 

2321 log.warning( 

2322 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2323 len(rejected), 

2324 len(accepted), 

2325 datasetType, 

2326 run, 

2327 ) 

2328 

2329 return source_refs 

2330 

2331 def validateConfiguration( 

2332 self, 

2333 logFailures: bool = False, 

2334 datasetTypeNames: Optional[Iterable[str]] = None, 

2335 ignore: Iterable[str] | None = None, 

2336 ) -> None: 

2337 """Validate butler configuration. 

2338 

2339 Checks that each `DatasetType` can be stored in the `Datastore`. 

2340 

2341 Parameters 

2342 ---------- 

2343 logFailures : `bool`, optional 

2344 If `True`, output a log message for every validation error 

2345 detected. 

2346 datasetTypeNames : iterable of `str`, optional 

2347 The `DatasetType` names that should be checked. This allows 

2348 only a subset to be selected. 

2349 ignore : iterable of `str`, optional 

2350 Names of DatasetTypes to skip over. This can be used to skip 

2351 known problems. If a named `DatasetType` corresponds to a 

2352 composite, all components of that `DatasetType` will also be 

2353 ignored. 

2354 

2355 Raises 

2356 ------ 

2357 ButlerValidationError 

2358 Raised if there is some inconsistency with how this Butler 

2359 is configured. 
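
Examples 
-------- 
A sketch of validating a repository while skipping a known-problematic 
dataset type; the ignored name is an illustrative assumption:: 

    butler.validateConfiguration(logFailures=True, ignore=["raw"]) 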

2360 """ 

2361 if datasetTypeNames: 

2362 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2363 else: 

2364 datasetTypes = list(self.registry.queryDatasetTypes()) 

2365 

2366 # filter out anything from the ignore list 

2367 if ignore: 

2368 ignore = set(ignore) 

2369 datasetTypes = [ 

2370 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2371 ] 

2372 else: 

2373 ignore = set() 

2374 

2375 # Find all the registered instruments 

2376 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2377 

2378 # For each datasetType that has an instrument dimension, create 

2379 # a DatasetRef for each defined instrument 

2380 datasetRefs = [] 

2381 

2382 for datasetType in datasetTypes: 

2383 if "instrument" in datasetType.dimensions: 

2384 for instrument in instruments: 

2385 datasetRef = DatasetRef( 

2386 datasetType, 

2387 {"instrument": instrument}, # type: ignore 

2388 conform=False, 

2389 run="validate", 

2390 ) 

2391 datasetRefs.append(datasetRef) 

2392 

2393 entities: List[Union[DatasetType, DatasetRef]] = [] 

2394 entities.extend(datasetTypes) 

2395 entities.extend(datasetRefs) 

2396 

2397 datastoreErrorStr = None 

2398 try: 

2399 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2400 except ValidationError as e: 

2401 datastoreErrorStr = str(e) 

2402 

2403 # Also check that the LookupKeys used by the datastores match 

2404 # registry and storage class definitions 

2405 keys = self.datastore.getLookupKeys() 

2406 

2407 failedNames = set() 

2408 failedDataId = set() 

2409 for key in keys: 

2410 if key.name is not None: 

2411 if key.name in ignore: 

2412 continue 

2413 

2414 # skip if specific datasetType names were requested and this 

2415 # name does not match 

2416 if datasetTypeNames and key.name not in datasetTypeNames: 

2417 continue 

2418 

2419 # See if it is a StorageClass or a DatasetType 

2420 if key.name in self.storageClasses: 

2421 pass 

2422 else: 

2423 try: 

2424 self.registry.getDatasetType(key.name) 

2425 except KeyError: 

2426 if logFailures: 

2427 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2428 failedNames.add(key) 

2429 else: 

2430 # Dimensions are checked for consistency when the Butler 

2431 # is created and rendezvoused with a universe. 

2432 pass 

2433 

2434 # Check that the instrument is a valid instrument 

2435 # Currently only support instrument so check for that 

2436 if key.dataId: 

2437 dataIdKeys = set(key.dataId) 

2438 if set(["instrument"]) != dataIdKeys: 

2439 if logFailures: 

2440 log.critical("Key '%s' has unsupported DataId override", key) 

2441 failedDataId.add(key) 

2442 elif key.dataId["instrument"] not in instruments: 

2443 if logFailures: 

2444 log.critical("Key '%s' has unknown instrument", key) 

2445 failedDataId.add(key) 

2446 

2447 messages = [] 

2448 

2449 if datastoreErrorStr: 

2450 messages.append(datastoreErrorStr) 

2451 

2452 for failed, msg in ( 

2453 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2454 (failedDataId, "Keys with bad DataId entries: "), 

2455 ): 

2456 if failed: 

2457 msg += ", ".join(str(k) for k in failed) 

2458 messages.append(msg) 

2459 

2460 if messages: 

2461 raise ValidationError(";\n".join(messages)) 

2462 

2463 @property 

2464 def collections(self) -> Sequence[str]: 

2465 """The collections to search by default, in order 

2466 (`Sequence` [ `str` ]). 

2467 

2468 This is an alias for ``self.registry.defaults.collections``. It cannot 

2469 be set directly in isolation, but all defaults may be changed together 

2470 by assigning a new `RegistryDefaults` instance to 

2471 ``self.registry.defaults``. 
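
For example, to change the default collections and run together (the 
collection and run names here are illustrative assumptions):: 

    from lsst.daf.butler.registry import RegistryDefaults 

    butler.registry.defaults = RegistryDefaults( 
        collections=["HSC/defaults"], run="u/someone/scratch" 
    ) 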

2472 """ 

2473 return self.registry.defaults.collections 

2474 

2475 @property 

2476 def run(self) -> Optional[str]: 

2477 """Name of the run this butler writes outputs to by default (`str` or 

2478 `None`). 

2479 

2480 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2481 directly in isolation, but all defaults may be changed together by 

2482 assigning a new `RegistryDefaults` instance to 

2483 ``self.registry.defaults``. 

2484 """ 

2485 return self.registry.defaults.run 

2486 

2487 @property 

2488 def dimensions(self) -> DimensionUniverse: 

2489 # Docstring inherited. 

2490 return self.registry.dimensions 

2491 

2492 registry: Registry 

2493 """The object that manages dataset metadata and relationships (`Registry`). 

2494 

2495 Most operations that don't involve reading or writing butler datasets are 

2496 accessible only via `Registry` methods. 

2497 """ 

2498 

2499 datastore: Datastore 

2500 """The object that manages actual dataset storage (`Datastore`). 

2501 

2502 Direct user access to the datastore should rarely be necessary; the primary 

2503 exception is the case where a `Datastore` implementation provides extra 

2504 functionality beyond what the base class defines. 

2505 """ 

2506 

2507 storageClasses: StorageClassFactory 

2508 """An object that maps known storage class names to objects that fully 

2509 describe them (`StorageClassFactory`). 

2510 """