Coverage for python/lsst/daf/butler/_butler.py: 8%

723 statements  

coverage.py v7.2.7, created at 2023-06-08 05:05 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import warnings 

39from collections import defaultdict 

40from typing import ( 

41 TYPE_CHECKING, 

42 Any, 

43 ClassVar, 

44 Counter, 

45 Dict, 

46 Iterable, 

47 Iterator, 

48 List, 

49 MutableMapping, 

50 Optional, 

51 Sequence, 

52 Set, 

53 TextIO, 

54 Tuple, 

55 Type, 

56 Union, 

57) 

58 

59from deprecated.sphinx import deprecated 

60from lsst.resources import ResourcePath, ResourcePathExpression 

61from lsst.utils import doImportType 

62from lsst.utils.introspection import get_class_of 

63from lsst.utils.logging import VERBOSE, getLogger 

64from sqlalchemy.exc import IntegrityError 

65 

66from ._butlerConfig import ButlerConfig 

67from ._butlerRepoIndex import ButlerRepoIndex 

68from ._dataset_existence import DatasetExistence 

69from ._deferredDatasetHandle import DeferredDatasetHandle 

70from ._limited_butler import LimitedButler 

71from .core import ( 

72 Config, 

73 ConfigSubset, 

74 DataCoordinate, 

75 DataId, 

76 DataIdValue, 

77 DatasetIdGenEnum, 

78 DatasetRef, 

79 DatasetRefURIs, 

80 DatasetType, 

81 Datastore, 

82 Dimension, 

83 DimensionConfig, 

84 DimensionElement, 

85 DimensionRecord, 

86 DimensionUniverse, 

87 FileDataset, 

88 Progress, 

89 StorageClass, 

90 StorageClassFactory, 

91 Timespan, 

92 ValidationError, 

93) 

94from .core.repoRelocation import BUTLER_ROOT_TAG 

95from .core.utils import transactional 

96from .registry import ( 

97 CollectionType, 

98 ConflictingDefinitionError, 

99 DataIdError, 

100 MissingDatasetTypeError, 

101 NoDefaultCollectionError, 

102 Registry, 

103 RegistryConfig, 

104 RegistryDefaults, 

105) 

106from .transfers import RepoExportContext 

107 

108if TYPE_CHECKING: 

109 from lsst.resources import ResourceHandleProtocol 

110 

111 from .transfers import RepoImportBackend 

112 

113log = getLogger(__name__) 

114 

115 

116class ButlerValidationError(ValidationError): 

117 """There is a problem with the Butler configuration.""" 

118 

119 pass 

120 

121 

122class Butler(LimitedButler): 

123 """Main entry point for the data access system. 

124 

125 Parameters 

126 ---------- 

127 config : `ButlerConfig`, `Config` or `str`, optional

128 Configuration. Anything acceptable to the 

129 `ButlerConfig` constructor. If a directory path 

130 is given the configuration will be read from a ``butler.yaml`` file in 

131 that location. If `None` is given default values will be used. 

132 butler : `Butler`, optional

133 If provided, construct a new Butler that uses the same registry and 

134 datastore as the given one, but with the given collection and run. 

135 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

136 arguments. 

137 collections : `str` or `Iterable` [ `str` ], optional 

138 An expression specifying the collections to be searched (in order) when 

139 reading datasets. 

140 This may be a `str` collection name or an iterable thereof. 

141 See :ref:`daf_butler_collection_expressions` for more information. 

142 These collections are not registered automatically and must be 

143 manually registered before they are used by any method, but they may be 

144 manually registered after the `Butler` is initialized. 

145 run : `str`, optional 

146 Name of the `~CollectionType.RUN` collection new datasets should be 

147 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

148 ``collections`` will be set to ``[run]``. If not `None`, this 

149 collection will automatically be registered. If this is not set (and 

150 ``writeable`` is not set either), a read-only butler will be created. 

151 searchPaths : `list` of `str`, optional 

152 Directory paths to search when calculating the full Butler 

153 configuration. Not used if the supplied config is already a 

154 `ButlerConfig`. 

155 writeable : `bool`, optional 

156 Explicitly sets whether the butler supports write operations. If not 

157 provided, a read-write butler is created if ``run`` is not `None`;

158 otherwise a read-only butler is created.

159 inferDefaults : `bool`, optional 

160 If `True` (default) infer default data ID values from the values 

161 present in the datasets in ``collections``: if all collections have the 

162 same value (or no value) for a governor dimension, that value will be 

163 the default for that dimension. Nonexistent collections are ignored. 

164 If a default value is provided explicitly for a governor dimension via 

165 ``**kwargs``, no default will be inferred for that dimension. 

166 **kwargs : `str` 

167 Default data ID key-value pairs. These may only identify "governor" 

168 dimensions like ``instrument`` and ``skymap``. 

169 

170 Examples 

171 -------- 

172 While there are many ways to control exactly how a `Butler` interacts with 

173 the collections in its `Registry`, the most common cases are still simple. 

174 

175 For a read-only `Butler` that searches one collection, do:: 

176 

177 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

178 

179 For a read-write `Butler` that writes to and reads from a 

180 `~CollectionType.RUN` collection:: 

181 

182 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

183 

184 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

185 because we want to write to one `~CollectionType.RUN` collection but read 

186 from several others (as well):: 

187 

188 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

189 collections=["u/alice/DM-50000/a", 

190 "u/bob/DM-49998", 

191 "HSC/defaults"]) 

192 

193 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

194 Datasets will be read first from that run (since it appears first in the 

195 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

196 

197 Finally, one can always create a `Butler` with no collections:: 

198 

199 butler = Butler("/path/to/repo", writeable=True) 

200 

201 This can be extremely useful when you just want to use ``butler.registry``, 

202 e.g. for inserting dimension data or managing collections, or when the 

203 collections you want to use with the butler are not consistent. 

204 Passing ``writeable`` explicitly here is only necessary if you want to be 

205 able to make changes to the repo - usually the value for ``writeable`` can 

206 be guessed from the collection arguments provided, but it defaults to 

207 `False` when there are no collection arguments.

208 """ 

209 

210 def __init__( 

211 self, 

212 config: Union[Config, ResourcePathExpression, None] = None, 

213 *, 

214 butler: Optional[Butler] = None, 

215 collections: Any = None, 

216 run: Optional[str] = None, 

217 searchPaths: Optional[Sequence[ResourcePathExpression]] = None, 

218 writeable: Optional[bool] = None, 

219 inferDefaults: bool = True, 

220 **kwargs: str, 

221 ): 

222 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

223 # Load registry, datastore, etc. from config or existing butler. 

224 if butler is not None: 

225 if config is not None or searchPaths is not None or writeable is not None: 

226 raise TypeError( 

227 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

228 ) 

229 self.registry = butler.registry.copy(defaults) 

230 self.datastore = butler.datastore 

231 self.storageClasses = butler.storageClasses 

232 self._config: ButlerConfig = butler._config 

233 else: 

234 # Can only look for strings in the known repos list. 

235 if isinstance(config, str): 

236 # Somehow ButlerConfig fails in some cases if config is a 

237 # ResourcePath, force it back to string here. 

238 config = str(self.get_repo_uri(config, True)) 

239 try: 

240 self._config = ButlerConfig(config, searchPaths=searchPaths) 

241 except FileNotFoundError as e: 

242 if known := self.get_known_repos(): 

243 aliases = f"(known aliases: {', '.join(known)})" 

244 else: 

245 aliases = "(no known aliases)" 

246 raise FileNotFoundError(f"{e} {aliases}") from e 

247 try: 

248 if "root" in self._config: 

249 butlerRoot = self._config["root"] 

250 else: 

251 butlerRoot = self._config.configDir 

252 if writeable is None: 

253 writeable = run is not None 

254 self.registry = Registry.fromConfig( 

255 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

256 ) 

257 self.datastore = Datastore.fromConfig( 

258 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

259 ) 

260 self.storageClasses = StorageClassFactory() 

261 self.storageClasses.addFromConfig(self._config) 

262 except Exception: 

263 # Failures here usually mean that configuration is incomplete, 

264 # just issue an error message which includes config file URI. 

265 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

266 raise 

267 

268 # For execution butler the datastore needs a special 

269 # dependency-inversion trick. This is not used by regular butler, 

270 # but we do not have a way to distinguish regular butler from execution 

271 # butler. 

272 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

273 

274 if "run" in self._config or "collection" in self._config: 

275 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

276 

277 GENERATION: ClassVar[int] = 3 

278 """This is a Generation 3 Butler. 

279 

280 This attribute may be removed in the future, once the Generation 2 Butler 

281 interface has been fully retired; it should only be used in transitional 

282 code. 

283 """ 

284 

285 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

286 """Return DatasetType defined in registry given dataset type name.""" 

287 try: 

288 return self.registry.getDatasetType(name) 

289 except MissingDatasetTypeError: 

290 return None 

291 

292 @classmethod 

293 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

294 """Look up the label in a butler repository index. 

295 

296 Parameters 

297 ---------- 

298 label : `str` 

299 Label of the Butler repository to look up. 

300 return_label : `bool`, optional 

301 If ``label`` cannot be found in the repository index (either 

302 because index is not defined or ``label`` is not in the index) and 

303 ``return_label`` is `True` then return ``ResourcePath(label)``. 

304 If ``return_label`` is `False` (default) then an exception will be 

305 raised instead. 

306 

307 Returns 

308 ------- 

309 uri : `lsst.resources.ResourcePath` 

310 URI to the Butler repository associated with the given label, or

311 ``ResourcePath(label)`` if the label was not found and ``return_label`` is `True`.

312 

313 Raises 

314 ------ 

315 KeyError 

316 Raised if the label is not found in the index, or if an index 

317 is not defined, and ``return_label`` is `False`. 

318 

319 Notes 

320 ----- 

321 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

322 information is discovered. 

323 """ 

324 return ButlerRepoIndex.get_repo_uri(label, return_label) 

325 

326 @classmethod 

327 def get_known_repos(cls) -> Set[str]: 

328 """Retrieve the list of known repository labels. 

329 

330 Returns 

331 ------- 

332 repos : `set` of `str` 

333 All the known labels. Can be empty if no index can be found. 

334 

335 Notes 

336 ----- 

337 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

338 information is discovered. 

339 """ 

340 return ButlerRepoIndex.get_known_repos() 
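
A hedged sketch combining the two classmethods above; the ``"main"`` label and the collection name are placeholders and assume that a butler repository index is configured::

    from lsst.daf.butler import Butler

    if "main" in Butler.get_known_repos():
        uri = Butler.get_repo_uri("main")
        butler = Butler(uri, collections=["u/alice/DM-50000"])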

341 

342 @staticmethod 

343 def makeRepo( 

344 root: ResourcePathExpression, 

345 config: Union[Config, str, None] = None, 

346 dimensionConfig: Union[Config, str, None] = None, 

347 standalone: bool = False, 

348 searchPaths: Optional[List[str]] = None, 

349 forceConfigRoot: bool = True, 

350 outfile: Optional[ResourcePathExpression] = None, 

351 overwrite: bool = False, 

352 ) -> Config: 

353 """Create an empty data repository by adding a butler.yaml config 

354 to a repository root directory. 

355 

356 Parameters 

357 ---------- 

358 root : `lsst.resources.ResourcePathExpression` 

359 Path or URI to the root location of the new repository. Will be 

360 created if it does not exist. 

361 config : `Config` or `str`, optional 

362 Configuration to write to the repository, after setting any 

363 root-dependent Registry or Datastore config options. Can not 

364 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

365 configuration will be used. Root-dependent config options 

366 specified in this config are overwritten if ``forceConfigRoot`` 

367 is `True`. 

368 dimensionConfig : `Config` or `str`, optional 

369 Configuration for dimensions, will be used to initialize registry 

370 database. 

371 standalone : `bool` 

372 If True, write all expanded defaults, not just customized or 

373 repository-specific settings. 

374 This (mostly) decouples the repository from the default 

375 configuration, insulating it from changes to the defaults (which 

376 may be good or bad, depending on the nature of the changes). 

377 Future *additions* to the defaults will still be picked up when 

378 initializing `Butlers` to repos created with ``standalone=True``. 

379 searchPaths : `list` of `str`, optional 

380 Directory paths to search when calculating the full butler 

381 configuration. 

382 forceConfigRoot : `bool`, optional 

383 If `False`, any values present in the supplied ``config`` that 

384 would normally be reset are not overridden and will appear 

385 directly in the output config. This allows non-standard overrides 

386 of the root directory for a datastore or registry to be given. 

387 If this parameter is `True` the values for ``root`` will be 

388 forced into the resulting config if appropriate. 

389 outfile : `lsst.resources.ResourcePathExpression`, optional

390 If not `None`, the output configuration will be written to this

391 location rather than into the repository itself. Can be a URI 

392 string. Can refer to a directory that will be used to write 

393 ``butler.yaml``. 

394 overwrite : `bool`, optional 

395 Create a new configuration file even if one already exists 

396 in the specified output location. Default is to raise 

397 an exception. 

398 

399 Returns 

400 ------- 

401 config : `Config` 

402 The updated `Config` instance written to the repo. 

403 

404 Raises 

405 ------ 

406 ValueError 

407 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

408 regular Config (as these subclasses would make it impossible to 

409 support ``standalone=False``). 

410 FileExistsError 

411 Raised if the output config file already exists. 

412 os.error 

413 Raised if the directory does not exist, exists but is not a 

414 directory, or cannot be created. 

415 

416 Notes 

417 ----- 

418 Note that when ``standalone=False`` (the default), the configuration 

419 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

420 construct the repository should also be used to construct any Butlers 

421 to avoid configuration inconsistencies. 

422 """ 

423 if isinstance(config, (ButlerConfig, ConfigSubset)): 

424 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

425 

426 # Ensure that the root of the repository exists or can be made 

427 root_uri = ResourcePath(root, forceDirectory=True) 

428 root_uri.mkdir() 

429 

430 config = Config(config) 

431 

432 # If we are creating a new repo from scratch with relative roots, 

433 # do not propagate an explicit root from the config file 

434 if "root" in config: 

435 del config["root"] 

436 

437 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

438 imported_class = doImportType(full["datastore", "cls"]) 

439 if not issubclass(imported_class, Datastore): 

440 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

441 datastoreClass: Type[Datastore] = imported_class 

442 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

443 

444 # if key exists in given config, parse it, otherwise parse the defaults 

445 # in the expanded config 

446 if config.get(("registry", "db")): 

447 registryConfig = RegistryConfig(config) 

448 else: 

449 registryConfig = RegistryConfig(full) 

450 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

451 if defaultDatabaseUri is not None: 

452 Config.updateParameters( 

453 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

454 ) 

455 else: 

456 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

457 

458 if standalone: 

459 config.merge(full) 

460 else: 

461 # Always expand the registry.managers section into the per-repo 

462 # config, because after the database schema is created, it's not 

463 # allowed to change anymore. Note that in the standalone=True 

464 # branch, _everything_ in the config is expanded, so there's no 

465 # need to special case this. 

466 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

467 configURI: ResourcePathExpression 

468 if outfile is not None: 

469 # When writing to a separate location we must include 

470 # the root of the butler repo in the config else it won't know 

471 # where to look. 

472 config["root"] = root_uri.geturl() 

473 configURI = outfile 

474 else: 

475 configURI = root_uri 

476 # Strip obscore configuration, if it is present, before writing config 

477 # to a file; the obscore config will be stored in the registry.

478 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

479 config_to_write = config.copy() 

480 del config_to_write[obscore_config_key] 

481 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

482 # configFile attribute is updated, need to copy it to original. 

483 config.configFile = config_to_write.configFile 

484 else: 

485 config.dumpToUri(configURI, overwrite=overwrite) 

486 

487 # Create Registry and populate tables 

488 registryConfig = RegistryConfig(config.get("registry")) 

489 dimensionConfig = DimensionConfig(dimensionConfig) 

490 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

491 

492 log.verbose("Wrote new Butler configuration file to %s", configURI) 

493 

494 return config 
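
A brief usage sketch (the path is a placeholder): a repository is typically created once with `makeRepo` and then opened with the normal constructor::

    from lsst.daf.butler import Butler

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)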

495 

496 @classmethod 

497 def _unpickle( 

498 cls, 

499 config: ButlerConfig, 

500 collections: Optional[tuple[str, ...]], 

501 run: Optional[str], 

502 defaultDataId: Dict[str, str], 

503 writeable: bool, 

504 ) -> Butler: 

505 """Callable used to unpickle a Butler. 

506 

507 We prefer not to use ``Butler.__init__`` directly so we can force some 

508 of its many arguments to be keyword-only (note that ``__reduce__`` 

509 can only invoke callables with positional arguments). 

510 

511 Parameters 

512 ---------- 

513 config : `ButlerConfig` 

514 Butler configuration, already coerced into a true `ButlerConfig` 

515 instance (and hence after any search paths for overrides have been 

516 utilized). 

517 collections : `tuple` [ `str` ] 

518 Names of the default collections to read from. 

519 run : `str`, optional 

520 Name of the default `~CollectionType.RUN` collection to write to. 

521 defaultDataId : `dict` [ `str`, `str` ] 

522 Default data ID values. 

523 writeable : `bool` 

524 Whether the Butler should support write operations. 

525 

526 Returns 

527 ------- 

528 butler : `Butler` 

529 A new `Butler` instance. 

530 """ 

531 # MyPy doesn't recognize that the kwargs below are totally valid; it 

532 # seems to think ``**defaultDataId`` is a _positional_ argument!

533 return cls( 

534 config=config, 

535 collections=collections, 

536 run=run, 

537 writeable=writeable, 

538 **defaultDataId, # type: ignore 

539 ) 

540 

541 def __reduce__(self) -> tuple: 

542 """Support pickling.""" 

543 return ( 

544 Butler._unpickle, 

545 ( 

546 self._config, 

547 self.collections, 

548 self.run, 

549 self.registry.defaults.dataId.byName(), 

550 self.registry.isWriteable(), 

551 ), 

552 ) 

553 

554 def __str__(self) -> str: 

555 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

556 self.collections, self.run, self.datastore, self.registry 

557 ) 

558 

559 def isWriteable(self) -> bool: 

560 """Return `True` if this `Butler` supports write operations.""" 

561 return self.registry.isWriteable() 

562 

563 @contextlib.contextmanager 

564 def transaction(self) -> Iterator[None]: 

565 """Context manager supporting `Butler` transactions. 

566 

567 Transactions can be nested. 

568 """ 

569 with self.registry.transaction(): 

570 with self.datastore.transaction(): 

571 yield 
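
An illustrative sketch of the transaction context manager; ``catalog``, ``table``, the dataset type names, and the data ID values are placeholders, and the example assumes a writeable butler with a default ``instrument``. If the second `put` raises, the first is rolled back in both registry and datastore::

    with butler.transaction():
        butler.put(catalog, "src", visit=903334, detector=20)
        butler.put(table, "srcMatch", visit=903334, detector=20)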

572 

573 def _standardizeArgs( 

574 self, 

575 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

576 dataId: Optional[DataId] = None, 

577 for_put: bool = True, 

578 **kwargs: Any, 

579 ) -> Tuple[DatasetType, Optional[DataId]]: 

580 """Standardize the arguments passed to several Butler APIs. 

581 

582 Parameters 

583 ---------- 

584 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

585 When a `DatasetRef` is provided, the ``dataId`` should be `None`.

586 Otherwise the `DatasetType` or name thereof. 

587 dataId : `dict` or `DataCoordinate` 

588 A `dict` of `Dimension` link name, value pairs that label the 

589 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

590 should be provided as the first argument.

591 for_put : `bool`, optional 

592 If `True` this call is invoked as part of a `Butler.put()`. 

593 Otherwise it is assumed to be part of a `Butler.get()`. This 

594 parameter is only relevant if there is dataset type 

595 inconsistency. 

596 **kwargs 

597 Additional keyword arguments used to augment or construct a 

598 `DataCoordinate`. See `DataCoordinate.standardize` 

599 parameters. 

600 

601 Returns 

602 ------- 

603 datasetType : `DatasetType` 

604 A `DatasetType` instance extracted from ``datasetRefOrType``. 

605 dataId : `dict` or `DataId`, optional 

606 Argument that can be used (along with ``kwargs``) to construct a 

607 `DataId`. 

608 

609 Notes 

610 ----- 

611 Butler APIs that conceptually need a DatasetRef also allow passing a 

612 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

613 keyword arguments that can be used to construct one) separately. This 

614 method accepts those arguments and always returns a true `DatasetType` 

615 and a `DataId` or `dict`. 

616 

617 Standardization of `dict` vs `DataId` is best handled by passing the 

618 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

619 generally similarly flexible. 

620 """ 

621 externalDatasetType: Optional[DatasetType] = None 

622 internalDatasetType: Optional[DatasetType] = None 

623 if isinstance(datasetRefOrType, DatasetRef): 

624 if dataId is not None or kwargs: 

625 raise ValueError("DatasetRef given, cannot use dataId as well") 

626 externalDatasetType = datasetRefOrType.datasetType 

627 dataId = datasetRefOrType.dataId 

628 else: 

629 # Don't check whether DataId is provided, because Registry APIs 

630 # can usually construct a better error message when it wasn't. 

631 if isinstance(datasetRefOrType, DatasetType): 

632 externalDatasetType = datasetRefOrType 

633 else: 

634 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

635 

636 # Check that they are self-consistent 

637 if externalDatasetType is not None: 

638 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

639 if externalDatasetType != internalDatasetType: 

640 # We can allow differences if they are compatible, depending 

641 # on whether this is a get or a put. A get requires that 

642 # the python type associated with the datastore can be 

643 # converted to the user type. A put requires that the user 

644 # supplied python type can be converted to the internal 

645 # type expected by registry. 

646 relevantDatasetType = internalDatasetType 

647 if for_put: 

648 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

649 else: 

650 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

651 relevantDatasetType = externalDatasetType 

652 if not is_compatible: 

653 raise ValueError( 

654 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

655 f"registry definition ({internalDatasetType})" 

656 ) 

657 # Override the internal definition. 

658 internalDatasetType = relevantDatasetType 

659 

660 assert internalDatasetType is not None 

661 return internalDatasetType, dataId 

662 

663 def _rewrite_data_id( 

664 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

665 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

666 """Rewrite a data ID taking into account dimension records. 

667 

668 Take a Data ID and keyword args and rewrite it if necessary to 

669 allow the user to specify dimension records rather than dimension 

670 primary values. 

671 

672 This allows a user to include a dataId dict with keys of 

673 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

674 the integer exposure ID. It also allows a string to be given 

675 for a dimension value rather than the integer ID if that is more 

676 convenient. For example, rather than having to specify the

677 detector with ``detector.full_name``, a string given for ``detector`` 

678 will be interpreted as the full name and converted to the integer 

679 value. 

680 

681 Keyword arguments can also use strings for dimensions like detector 

682 and exposure, but Python does not allow them to include ``.``, and

683 so the ``exposure.day_obs`` syntax cannot be used in a keyword

684 argument. 

685 

686 Parameters 

687 ---------- 

688 dataId : `dict` or `DataCoordinate` 

689 A `dict` of `Dimension` link name, value pairs that will label the 

690 `DatasetRef` within a Collection. 

691 datasetType : `DatasetType` 

692 The dataset type associated with this dataId. Required to 

693 determine the relevant dimensions. 

694 **kwargs 

695 Additional keyword arguments used to augment or construct a 

696 `DataId`. See `DataId` parameters. 

697 

698 Returns 

699 ------- 

700 dataId : `dict` or `DataCoordinate` 

701 The possibly rewritten dataId. If given a `DataCoordinate` and

702 no keyword arguments, the original dataId will be returned 

703 unchanged. 

704 **kwargs : `dict` 

705 Any unused keyword arguments (normally an empty `dict`).

706 """ 

707 # Do nothing if we have a standalone DataCoordinate. 

708 if isinstance(dataId, DataCoordinate) and not kwargs: 

709 return dataId, kwargs 

710 

711 # Process dimension records that are using record information 

712 # rather than ids 

713 newDataId: Dict[str, DataIdValue] = {} 

714 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

715 

716 # if all the dataId comes from keyword parameters we do not need 

717 # to do anything here because they can't be of the form 

718 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

719 if dataId: 

720 for k, v in dataId.items(): 

721 # If we have a Dimension we do not need to do anything 

722 # because it cannot be a compound key. 

723 if isinstance(k, str) and "." in k: 

724 # Someone is using a more human-readable dataId 

725 dimensionName, record = k.split(".", 1) 

726 byRecord[dimensionName][record] = v 

727 elif isinstance(k, Dimension): 

728 newDataId[k.name] = v 

729 else: 

730 newDataId[k] = v 

731 

732 # Go through the updated dataId and check the type in case someone is 

733 # using an alternate key. We have already filtered out the compound 

734 # ``dimension.record`` keys.

735 not_dimensions = {} 

736 

737 # Will need to look in the dataId and the keyword arguments 

738 # and will remove them if they need to be fixed or are unrecognized. 

739 for dataIdDict in (newDataId, kwargs): 

740 # Use a list so we can adjust the dict safely in the loop 

741 for dimensionName in list(dataIdDict): 

742 value = dataIdDict[dimensionName] 

743 try: 

744 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

745 except KeyError: 

746 # This is not a real dimension 

747 not_dimensions[dimensionName] = value 

748 del dataIdDict[dimensionName] 

749 continue 

750 

751 # Convert an integral type to an explicit int to simplify 

752 # comparisons here 

753 if isinstance(value, numbers.Integral): 

754 value = int(value) 

755 

756 if not isinstance(value, dimension.primaryKey.getPythonType()): 

757 for alternate in dimension.alternateKeys: 

758 if isinstance(value, alternate.getPythonType()): 

759 byRecord[dimensionName][alternate.name] = value 

760 del dataIdDict[dimensionName] 

761 log.debug( 

762 "Converting dimension %s to %s.%s=%s", 

763 dimensionName, 

764 dimensionName, 

765 alternate.name, 

766 value, 

767 ) 

768 break 

769 else: 

770 log.warning( 

771 "Type mismatch found for value '%r' provided for dimension %s. " 

772 "Could not find matching alternative (primary key has type %s) " 

773 "so attempting to use as-is.", 

774 value, 

775 dimensionName, 

776 dimension.primaryKey.getPythonType(), 

777 ) 

778 

779 # By this point kwargs and newDataId should only include valid 

780 # dimensions. Merge kwargs in to the new dataId and log if there 

781 # are dimensions in both (rather than calling update). 

782 for k, v in kwargs.items(): 

783 if k in newDataId and newDataId[k] != v: 

784 log.debug( 

785 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

786 ) 

787 newDataId[k] = v 

788 # No need to retain any values in kwargs now. 

789 kwargs = {} 

790 

791 # If we have some unrecognized dimensions we have to try to connect 

792 # them to records in other dimensions. This is made more complicated 

793 # by some dimensions having records with clashing names. A mitigation 

794 # is that we can tell by this point which dimensions are missing 

795 # for the DatasetType but this does not work for calibrations 

796 # where additional dimensions can be used to constrain the temporal 

797 # axis. 

798 if not_dimensions: 

799 # Search for all dimensions even if we have been given a value 

800 # explicitly. In some cases records are given as well as the 

801 # actual dimension and this should not be an error if they

802 # match. 

803 mandatoryDimensions = datasetType.dimensions.names # - provided 

804 

805 candidateDimensions: Set[str] = set() 

806 candidateDimensions.update(mandatoryDimensions) 

807 

808 # For calibrations we may well need temporal dimensions

809 # so rather than always including all dimensions in the scan 

810 # restrict things a little. It is still possible for there 

811 # to be confusion over day_obs in visit vs exposure for example. 

812 # If we are not searching calibration collections things may 

813 # fail but they are going to fail anyway because of the 

814 # ambiguity of the dataId...

815 if datasetType.isCalibration(): 

816 for dim in self.registry.dimensions.getStaticDimensions(): 

817 if dim.temporal: 

818 candidateDimensions.add(str(dim)) 

819 

820 # Look up table for the first association with a dimension 

821 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

822 

823 # Keep track of whether an item is associated with multiple 

824 # dimensions. 

825 counter: Counter[str] = Counter() 

826 assigned: Dict[str, Set[str]] = defaultdict(set) 

827 

828 # Go through the missing dimensions and associate the 

829 # given names with records within those dimensions 

830 matched_dims = set() 

831 for dimensionName in candidateDimensions: 

832 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

833 fields = dimension.metadata.names | dimension.uniqueKeys.names 

834 for field in not_dimensions: 

835 if field in fields: 

836 guessedAssociation[dimensionName][field] = not_dimensions[field] 

837 counter[dimensionName] += 1 

838 assigned[field].add(dimensionName) 

839 matched_dims.add(field) 

840 

841 # Calculate the fields that matched nothing. 

842 never_found = set(not_dimensions) - matched_dims 

843 

844 if never_found: 

845 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

846 

847 # There is a chance we have allocated a single dataId item 

848 # to multiple dimensions. Need to decide which should be retained. 

849 # For now assume that the most popular alternative wins. 

850 # This means that day_obs with seq_num will result in 

851 # exposure.day_obs and not visit.day_obs 

852 # Also prefer an explicitly missing dimension over an inferred 

853 # temporal dimension. 

854 for fieldName, assignedDimensions in assigned.items(): 

855 if len(assignedDimensions) > 1: 

856 # Pick the most popular (preferring mandatory dimensions) 

857 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

858 if requiredButMissing: 

859 candidateDimensions = requiredButMissing 

860 else: 

861 candidateDimensions = assignedDimensions 

862 

863 # If this is a choice between visit and exposure and 

864 # neither was a required part of the dataset type, 

865 # (hence in this branch) always prefer exposure over 

866 # visit since exposures are always defined and visits 

867 # are defined from exposures. 

868 if candidateDimensions == {"exposure", "visit"}: 

869 candidateDimensions = {"exposure"} 

870 

871 # Select the relevant items and get a new restricted 

872 # counter. 

873 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

874 duplicatesCounter: Counter[str] = Counter() 

875 duplicatesCounter.update(theseCounts) 

876 

877 # Choose the most common. If they are equally common 

878 # we will pick the one that was found first. 

879 # Returns a list of tuples 

880 selected = duplicatesCounter.most_common(1)[0][0] 

881 

882 log.debug( 

883 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

884 " Removed ambiguity by choosing dimension %s.", 

885 fieldName, 

886 ", ".join(assignedDimensions), 

887 selected, 

888 ) 

889 

890 for candidateDimension in assignedDimensions: 

891 if candidateDimension != selected: 

892 del guessedAssociation[candidateDimension][fieldName] 

893 

894 # Update the record look up dict with the new associations 

895 for dimensionName, values in guessedAssociation.items(): 

896 if values: # A dict might now be empty 

897 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

898 byRecord[dimensionName].update(values) 

899 

900 if byRecord: 

901 # Some record specifiers were found so we need to convert 

902 # them to the Id form 

903 for dimensionName, values in byRecord.items(): 

904 if dimensionName in newDataId: 

905 log.debug( 

906 "DataId specified explicit %s dimension value of %s in addition to" 

907 " general record specifiers for it of %s. Ignoring record information.", 

908 dimensionName, 

909 newDataId[dimensionName], 

910 str(values), 

911 ) 

912 # Get the actual record and compare with these values. 

913 try: 

914 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

915 except DataIdError: 

916 raise ValueError( 

917 f"Could not find dimension '{dimensionName}'" 

918 f" with dataId {newDataId} as part of comparing with" 

919 f" record values {byRecord[dimensionName]}" 

920 ) from None 

921 if len(recs) == 1: 

922 errmsg: List[str] = [] 

923 for k, v in values.items(): 

924 if (recval := getattr(recs[0], k)) != v: 

925 errmsg.append(f"{k}({recval} != {v})") 

926 if errmsg: 

927 raise ValueError( 

928 f"Dimension {dimensionName} in dataId has explicit value" 

929 " inconsistent with records: " + ", ".join(errmsg) 

930 ) 

931 else: 

932 # Multiple matches for an explicit dimension 

933 # should never happen but let downstream complain. 

934 pass 

935 continue 

936 

937 # Build up a WHERE expression 

938 bind = {k: v for k, v in values.items()} 

939 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

940 

941 # Hopefully we get a single record that matches 

942 records = set( 

943 self.registry.queryDimensionRecords( 

944 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

945 ) 

946 ) 

947 

948 if len(records) != 1: 

949 if len(records) > 1: 

950 # visit can have an ambiguous answer without involving 

951 # visit_system. The default visit_system is defined 

952 # by the instrument. 

953 if ( 

954 dimensionName == "visit" 

955 and "visit_system_membership" in self.registry.dimensions 

956 and "visit_system" in self.registry.dimensions["instrument"].metadata 

957 ): 

958 instrument_records = list( 

959 self.registry.queryDimensionRecords( 

960 "instrument", 

961 dataId=newDataId, 

962 **kwargs, 

963 ) 

964 ) 

965 if len(instrument_records) == 1: 

966 visit_system = instrument_records[0].visit_system 

967 if visit_system is None: 

968 # Set to a value that will never match. 

969 visit_system = -1 

970 

971 # Look up each visit in the 

972 # visit_system_membership records. 

973 for rec in records: 

974 membership = list( 

975 self.registry.queryDimensionRecords( 

976 # Use bind to allow zero results. 

977 # This is a fully-specified query. 

978 "visit_system_membership", 

979 where="instrument = inst AND visit_system = system AND visit = v", 

980 bind=dict( 

981 inst=instrument_records[0].name, system=visit_system, v=rec.id 

982 ), 

983 ) 

984 ) 

985 if membership: 

986 # This record is the right answer. 

987 records = set([rec]) 

988 break 

989 

990 # The ambiguity may have been resolved so check again. 

991 if len(records) > 1: 

992 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

993 for r in records: 

994 log.debug("- %s", str(r)) 

995 raise ValueError( 

996 f"DataId specification for dimension {dimensionName} is not" 

997 f" uniquely constrained to a single dataset by {values}." 

998 f" Got {len(records)} results." 

999 ) 

1000 else: 

1001 raise ValueError( 

1002 f"DataId specification for dimension {dimensionName} matched no" 

1003 f" records when constrained by {values}" 

1004 ) 

1005 

1006 # Get the primary key from the real dimension object 

1007 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1008 if not isinstance(dimension, Dimension): 

1009 raise RuntimeError( 

1010 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1011 ) 

1012 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1013 

1014 return newDataId, kwargs 
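
To illustrate the rewriting performed by this method (the ``raw`` dataset type, instrument, and record values are purely illustrative), the two calls below identify the same dataset; the record-style keys in the second call are translated into the exposure's integer primary key::

    butler.get("raw", instrument="LATISS", detector=0, exposure=2022030500001)
    butler.get(
        "raw",
        dataId={"exposure.day_obs": 20220305, "exposure.seq_num": 1},
        instrument="LATISS",
        detector=0,
    )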

1015 

1016 def _findDatasetRef( 

1017 self, 

1018 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1019 dataId: Optional[DataId] = None, 

1020 *, 

1021 collections: Any = None, 

1022 predict: bool = False, 

1023 run: str | None = None, 

1024 **kwargs: Any, 

1025 ) -> DatasetRef: 

1026 """Shared logic for methods that start with a search for a dataset in 

1027 the registry. 

1028 

1029 Parameters 

1030 ---------- 

1031 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1032 When a `DatasetRef` is provided, the ``dataId`` should be `None`.

1033 Otherwise the `DatasetType` or name thereof. 

1034 dataId : `dict` or `DataCoordinate`, optional 

1035 A `dict` of `Dimension` link name, value pairs that label the 

1036 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1037 should be provided as the first argument. 

1038 collections : Any, optional 

1039 Collections to be searched, overriding ``self.collections``. 

1040 Can be any of the types supported by the ``collections`` argument 

1041 to butler construction. 

1042 predict : `bool`, optional 

1043 If `True`, return a newly created `DatasetRef` with a unique 

1044 dataset ID if finding a reference in the `Registry` fails. 

1045 Defaults to `False`. 

1046 run : `str`, optional 

1047 Run collection name to use for creating `DatasetRef` for predicted 

1048 datasets. Only used if ``predict`` is `True`. 

1049 **kwargs 

1050 Additional keyword arguments used to augment or construct a 

1051 `DataId`. See `DataId` parameters. 

1052 

1053 Returns 

1054 ------- 

1055 ref : `DatasetRef` 

1056 A reference to the dataset identified by the given arguments. 

1057 This can be the same dataset reference as given if it was 

1058 resolved. 

1059 

1060 Raises 

1061 ------ 

1062 LookupError 

1063 Raised if no matching dataset exists in the `Registry` (and 

1064 ``predict`` is `False`). 

1065 ValueError 

1066 Raised if a resolved `DatasetRef` was passed as an input, but it 

1067 differs from the one found in the registry. 

1068 TypeError 

1069 Raised if no collections were provided. 

1070 """ 

1071 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1072 if isinstance(datasetRefOrType, DatasetRef): 

1073 if collections is not None: 

1074 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

1075 return datasetRefOrType 

1076 timespan: Optional[Timespan] = None 

1077 

1078 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1079 

1080 if datasetType.isCalibration(): 

1081 # Because this is a calibration dataset, first try to

1082 # standardize the data ID without restricting the dimensions to 

1083 # those of the dataset type requested, because there may be extra 

1084 # dimensions that provide temporal information for a validity-range 

1085 # lookup. 

1086 dataId = DataCoordinate.standardize( 

1087 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1088 ) 

1089 if dataId.graph.temporal: 

1090 dataId = self.registry.expandDataId(dataId) 

1091 timespan = dataId.timespan 

1092 else: 

1093 # Standardize the data ID to just the dimensions of the dataset 

1094 # type instead of letting registry.findDataset do it, so we get the 

1095 # result even if no dataset is found. 

1096 dataId = DataCoordinate.standardize( 

1097 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1098 ) 

1099 # Always look up the DatasetRef, even if one is given, to ensure it is

1100 # present in the current collection. 

1101 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1102 if ref is None: 

1103 if predict: 

1104 if run is None: 

1105 run = self.run 

1106 if run is None: 

1107 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1108 return DatasetRef(datasetType, dataId, run=run) 

1109 else: 

1110 if collections is None: 

1111 collections = self.registry.defaults.collections 

1112 raise LookupError( 

1113 f"Dataset {datasetType.name} with data ID {dataId} " 

1114 f"could not be found in collections {collections}." 

1115 ) 

1116 if datasetType != ref.datasetType: 

1117 # If they differ it is because the user explicitly specified 

1118 # a compatible dataset type to this call rather than using the 

1119 # registry definition. The DatasetRef must therefore be recreated 

1120 # using the user definition such that the expected type is 

1121 # returned. 

1122 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1123 

1124 return ref 

1125 

1126 @transactional 

1127 @deprecated( 

1128 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1129 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1130 " were relying on the run parameter to determine the run." 

1131 " Will be removed after v27.0.", 

1132 version="v26.0", 

1133 category=FutureWarning, 

1134 ) 

1135 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1136 # Docstring inherited. 

1137 return self.put(obj, ref) 

1138 

1139 @transactional 

1140 def put( 

1141 self, 

1142 obj: Any, 

1143 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1144 /, 

1145 dataId: Optional[DataId] = None, 

1146 *, 

1147 run: Optional[str] = None, 

1148 **kwargs: Any, 

1149 ) -> DatasetRef: 

1150 """Store and register a dataset. 

1151 

1152 Parameters 

1153 ---------- 

1154 obj : `object` 

1155 The dataset. 

1156 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1157 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1158 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1159 `DatasetRef` is given the run and ID are used directly. 

1160 dataId : `dict` or `DataCoordinate` 

1161 A `dict` of `Dimension` link name, value pairs that label the 

1162 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1163 should be provided as the second argument. 

1164 run : `str`, optional 

1165 The name of the run the dataset should be added to, overriding 

1166 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1167 **kwargs 

1168 Additional keyword arguments used to augment or construct a 

1169 `DataCoordinate`. See `DataCoordinate.standardize` 

1170 parameters. Not used if a resolved `DatasetRef` is provided.

1171 

1172 Returns 

1173 ------- 

1174 ref : `DatasetRef` 

1175 A reference to the stored dataset, updated with the correct id if 

1176 given. 

1177 

1178 Raises 

1179 ------ 

1180 TypeError 

1181 Raised if the butler is read-only or if no run has been provided. 

1182 """ 

1183 if isinstance(datasetRefOrType, DatasetRef): 

1184 # This is a direct put of predefined DatasetRef. 

1185 log.debug("Butler put direct: %s", datasetRefOrType) 

1186 if run is not None: 

1187 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

1188 # If registry already has a dataset with the same dataset ID, 

1189 # dataset type and DataId, then _importDatasets will do nothing and 

1190 # just return the original ref. We have to raise in this case;

1191 # there is a datastore check below for that.

1192 self.registry._importDatasets([datasetRefOrType], expand=True) 

1193 # Before trying to write to the datastore check that it does not 

1194 # know this dataset. This is prone to races, of course. 

1195 if self.datastore.knows(datasetRefOrType): 

1196 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1197 # Try to write dataset to the datastore, if it fails due to a race 

1198 # with another write, the content of stored data may be 

1199 # unpredictable. 

1200 try: 

1201 self.datastore.put(obj, datasetRefOrType) 

1202 except IntegrityError as e: 

1203 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") 

1204 return datasetRefOrType 

1205 

1206 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1207 if not self.isWriteable(): 

1208 raise TypeError("Butler is read-only.") 

1209 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1210 

1211 # Handle dimension records in dataId 

1212 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1213 

1214 # Add Registry Dataset entry. 

1215 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1216 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1217 self.datastore.put(obj, ref) 

1218 

1219 return ref 
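
A short usage sketch (the run, dataset type, and data ID are placeholders; ``exposure_object`` stands in for the in-memory dataset)::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(exposure_object, "calexp", visit=903334, detector=20, instrument="HSC")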

1220 

1221 @deprecated( 

1222 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1223 " Please use Butler.get(). Will be removed after v27.0.", 

1224 version="v26.0", 

1225 category=FutureWarning, 

1226 ) 

1227 def getDirect( 

1228 self, 

1229 ref: DatasetRef, 

1230 *, 

1231 parameters: Optional[Dict[str, Any]] = None, 

1232 storageClass: Optional[Union[StorageClass, str]] = None, 

1233 ) -> Any: 

1234 """Retrieve a stored dataset. 

1235 

1236 Parameters 

1237 ---------- 

1238 ref : `DatasetRef` 

1239 Resolved reference to an already stored dataset. 

1240 parameters : `dict` 

1241 Additional StorageClass-defined options to control reading, 

1242 typically used to efficiently read only a subset of the dataset. 

1243 storageClass : `StorageClass` or `str`, optional 

1244 The storage class to be used to override the Python type 

1245 returned by this method. By default the returned type matches 

1246 the dataset type definition for this dataset. Specifying a 

1247 read `StorageClass` can force a different type to be returned. 

1248 This type must be compatible with the original type. 

1249 

1250 Returns 

1251 ------- 

1252 obj : `object` 

1253 The dataset. 

1254 """ 

1255 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1256 

1257 @deprecated( 

1258 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1259 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1260 version="v26.0", 

1261 category=FutureWarning, 

1262 ) 

1263 def getDirectDeferred( 

1264 self, 

1265 ref: DatasetRef, 

1266 *, 

1267 parameters: Union[dict, None] = None, 

1268 storageClass: str | StorageClass | None = None, 

1269 ) -> DeferredDatasetHandle: 

1270 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1271 from a resolved `DatasetRef`. 

1272 

1273 Parameters 

1274 ---------- 

1275 ref : `DatasetRef` 

1276 Resolved reference to an already stored dataset. 

1277 parameters : `dict` 

1278 Additional StorageClass-defined options to control reading, 

1279 typically used to efficiently read only a subset of the dataset. 

1280 storageClass : `StorageClass` or `str`, optional 

1281 The storage class to be used to override the Python type 

1282 returned by this method. By default the returned type matches 

1283 the dataset type definition for this dataset. Specifying a 

1284 read `StorageClass` can force a different type to be returned. 

1285 This type must be compatible with the original type. 

1286 

1287 Returns 

1288 ------- 

1289 obj : `DeferredDatasetHandle` 

1290 A handle which can be used to retrieve a dataset at a later time. 

1291 

1292 Raises 

1293 ------ 

1294 LookupError 

1295 Raised if no matching dataset exists in the `Registry`. 

1296 """ 

1297 # Check that the dataset actually exists.

1298 if not self.datastore.exists(ref): 

1299 raise LookupError(f"Dataset reference {ref} does not exist.") 

1300 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1301 

1302 def getDeferred( 

1303 self, 

1304 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1305 /, 

1306 dataId: Optional[DataId] = None, 

1307 *, 

1308 parameters: Union[dict, None] = None, 

1309 collections: Any = None, 

1310 storageClass: str | StorageClass | None = None, 

1311 **kwargs: Any, 

1312 ) -> DeferredDatasetHandle: 

1313 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1314 after an immediate registry lookup. 

1315 

1316 Parameters 

1317 ---------- 

1318 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1319 When a `DatasetRef` is provided, the ``dataId`` should be `None`.

1320 Otherwise the `DatasetType` or name thereof. 

1321 dataId : `dict` or `DataCoordinate`, optional 

1322 A `dict` of `Dimension` link name, value pairs that label the 

1323 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1324 should be provided as the first argument. 

1325 parameters : `dict` 

1326 Additional StorageClass-defined options to control reading, 

1327 typically used to efficiently read only a subset of the dataset. 

1328 collections : Any, optional 

1329 Collections to be searched, overriding ``self.collections``. 

1330 Can be any of the types supported by the ``collections`` argument 

1331 to butler construction. 

1332 storageClass : `StorageClass` or `str`, optional 

1333 The storage class to be used to override the Python type 

1334 returned by this method. By default the returned type matches 

1335 the dataset type definition for this dataset. Specifying a 

1336 read `StorageClass` can force a different type to be returned. 

1337 This type must be compatible with the original type. 

1338 **kwargs 

1339 Additional keyword arguments used to augment or construct a 

1340 `DataId`. See `DataId` parameters. 

1341 

1342 Returns 

1343 ------- 

1344 obj : `DeferredDatasetHandle` 

1345 A handle which can be used to retrieve a dataset at a later time. 

1346 

1347 Raises 

1348 ------ 

1349 LookupError 

1350 Raised if no matching dataset exists in the `Registry`. 

1351 ValueError 

1352 Raised if a resolved `DatasetRef` was passed as an input, but it 

1353 differs from the one found in the registry. 

1354 TypeError 

1355 Raised if no collections were provided. 

1356 """ 

1357 if isinstance(datasetRefOrType, DatasetRef) and not self.datastore.exists(datasetRefOrType): 

1358 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1359 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1360 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 
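
For example (all names are illustrative), the registry lookup happens immediately but the datastore read is deferred until the handle is used; `DeferredDatasetHandle.get` is assumed here to perform the actual read::

    handle = butler.getDeferred("calexp", visit=903334, detector=20, instrument="HSC")
    # ... later, only if the pixels are actually needed:
    calexp = handle.get()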

1361 

1362 def get( 

1363 self, 

1364 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1365 /, 

1366 dataId: Optional[DataId] = None, 

1367 *, 

1368 parameters: Optional[Dict[str, Any]] = None, 

1369 collections: Any = None, 

1370 storageClass: Optional[Union[StorageClass, str]] = None, 

1371 **kwargs: Any, 

1372 ) -> Any: 

1373 """Retrieve a stored dataset. 

1374 

1375 Parameters 

1376 ---------- 

1377 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1378 When a `DatasetRef` is provided, the ``dataId`` should be `None`.

1379 Otherwise the `DatasetType` or name thereof. 

1380 If a resolved `DatasetRef`, the associated dataset 

1381 is returned directly without additional querying. 

1382 dataId : `dict` or `DataCoordinate` 

1383 A `dict` of `Dimension` link name, value pairs that label the 

1384 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1385 should be provided as the first argument. 

1386 parameters : `dict` 

1387 Additional StorageClass-defined options to control reading, 

1388 typically used to efficiently read only a subset of the dataset. 

1389 collections : Any, optional 

1390 Collections to be searched, overriding ``self.collections``. 

1391 Can be any of the types supported by the ``collections`` argument 

1392 to butler construction. 

1393 storageClass : `StorageClass` or `str`, optional 

1394 The storage class to be used to override the Python type 

1395 returned by this method. By default the returned type matches 

1396 the dataset type definition for this dataset. Specifying a 

1397 read `StorageClass` can force a different type to be returned. 

1398 This type must be compatible with the original type. 

1399 **kwargs 

1400 Additional keyword arguments used to augment or construct a 

1401 `DataCoordinate`. See `DataCoordinate.standardize` 

1402 parameters. 

1403 

1404 Returns 

1405 ------- 

1406 obj : `object` 

1407 The dataset. 

1408 

1409 Raises 

1410 ------ 

1411 LookupError 

1412 Raised if no matching dataset exists in the `Registry`. 

1413 TypeError 

1414 Raised if no collections were provided. 

1415 

1416 Notes 

1417 ----- 

1418 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1419 this method requires that the given data ID include temporal dimensions 

1420 beyond the dimensions of the dataset type itself, in order to find the 

1421 dataset with the appropriate validity range. For example, a "bias" 

1422 dataset with native dimensions ``{instrument, detector}`` could be 

1423 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1424 ``exposure`` is a temporal dimension. 
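
Examples
--------
Continuing the calibration lookup described above, with illustrative
data ID values and collection name::

    bias = butler.get(
        "bias",
        instrument="ExampleCam",
        detector=42,
        exposure=1234,
        collections="calib/example",
    )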

1425 """ 

1426 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1427 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1428 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1429 

1430 def getURIs( 

1431 self, 

1432 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1433 /, 

1434 dataId: Optional[DataId] = None, 

1435 *, 

1436 predict: bool = False, 

1437 collections: Any = None, 

1438 run: Optional[str] = None, 

1439 **kwargs: Any, 

1440 ) -> DatasetRefURIs: 

1441 """Returns the URIs associated with the dataset. 

1442 

1443 Parameters 

1444 ---------- 

1445 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1446 When `DatasetRef` the `dataId` should be `None`. 

1447 Otherwise the `DatasetType` or name thereof. 

1448 dataId : `dict` or `DataCoordinate` 

1449 A `dict` of `Dimension` link name, value pairs that label the 

1450 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1451 should be provided as the first argument. 

1452 predict : `bool` 

1453 If `True`, allow URIs to be returned of datasets that have not 

1454 been written. 

1455 collections : Any, optional 

1456 Collections to be searched, overriding ``self.collections``. 

1457 Can be any of the types supported by the ``collections`` argument 

1458 to butler construction. 

1459 run : `str`, optional 

1460 Run to use for predictions, overriding ``self.run``. 

1461 **kwargs 

1462 Additional keyword arguments used to augment or construct a 

1463 `DataCoordinate`. See `DataCoordinate.standardize` 

1464 parameters. 

1465 

1466 Returns 

1467 ------- 

1468 uris : `DatasetRefURIs` 

1469 The URI to the primary artifact associated with this dataset (if 

1470 the dataset was disassembled within the datastore this may be 

1471 `None`), and the URIs to any components associated with the dataset 

1472 artifact (can be empty if there are no components).
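
Examples
--------
A minimal sketch with illustrative data ID values and collection name;
the returned `DatasetRefURIs` can be unpacked into the primary URI and
the per-component URIs::

    primary, components = butler.getURIs(
        "bias", instrument="ExampleCam", detector=42, collections="calib/example"
    )
    if primary is not None:
        print(primary)
    for component, uri in components.items():
        print(component, uri)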

1473 """ 

1474 ref = self._findDatasetRef( 

1475 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1476 ) 

1477 return self.datastore.getURIs(ref, predict) 

1478 

1479 def getURI( 

1480 self, 

1481 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1482 /, 

1483 dataId: Optional[DataId] = None, 

1484 *, 

1485 predict: bool = False, 

1486 collections: Any = None, 

1487 run: Optional[str] = None, 

1488 **kwargs: Any, 

1489 ) -> ResourcePath: 

1490 """Return the URI to the Dataset. 

1491 

1492 Parameters 

1493 ---------- 

1494 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1495 When `DatasetRef` the `dataId` should be `None`. 

1496 Otherwise the `DatasetType` or name thereof. 

1497 dataId : `dict` or `DataCoordinate` 

1498 A `dict` of `Dimension` link name, value pairs that label the 

1499 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1500 should be provided as the first argument. 

1501 predict : `bool` 

1502 If `True`, allow URIs to be returned of datasets that have not 

1503 been written. 

1504 collections : Any, optional 

1505 Collections to be searched, overriding ``self.collections``. 

1506 Can be any of the types supported by the ``collections`` argument 

1507 to butler construction. 

1508 run : `str`, optional 

1509 Run to use for predictions, overriding ``self.run``. 

1510 **kwargs 

1511 Additional keyword arguments used to augment or construct a 

1512 `DataCoordinate`. See `DataCoordinate.standardize` 

1513 parameters. 

1514 

1515 Returns 

1516 ------- 

1517 uri : `lsst.resources.ResourcePath` 

1518 URI pointing to the Dataset within the datastore. If the 

1519 Dataset does not exist in the datastore, and if ``predict`` is 

1520 `True`, the URI will be a prediction and will include a URI 

1521 fragment "#predicted". 

1522 If the datastore does not have entities that relate well 

1523 to the concept of a URI, the returned URI string will be

1524 descriptive. The returned URI is not guaranteed to be obtainable. 

1525 

1526 Raises 

1527 ------ 

1528 LookupError 

1529 Raised if a URI has been requested for a dataset that does not exist and

1530 guessing is not allowed. 

1531 ValueError 

1532 Raised if a resolved `DatasetRef` was passed as an input, but it 

1533 differs from the one found in the registry. 

1534 TypeError 

1535 Raised if no collections were provided. 

1536 RuntimeError 

1537 Raised if a URI is requested for a dataset that consists of 

1538 multiple artifacts. 
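
Examples
--------
A minimal sketch with illustrative data ID values, collection, and run
names::

    # URI of a dataset that already exists in the datastore.
    uri = butler.getURI(
        "bias", instrument="ExampleCam", detector=42, collections="calib/example"
    )

    # Predicted URI (carrying a "#predicted" fragment) for a dataset
    # that has not been written yet.
    predicted = butler.getURI(
        "bias",
        instrument="ExampleCam",
        detector=42,
        predict=True,
        collections="calib/example",
        run="calib/example",
    )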

1539 """ 

1540 primary, components = self.getURIs( 

1541 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1542 ) 

1543 

1544 if primary is None or components: 

1545 raise RuntimeError( 

1546 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1547 "Use Butler.getURIs() instead." 

1548 ) 

1549 return primary 

1550 

1551 def retrieveArtifacts( 

1552 self, 

1553 refs: Iterable[DatasetRef], 

1554 destination: ResourcePathExpression, 

1555 transfer: str = "auto", 

1556 preserve_path: bool = True, 

1557 overwrite: bool = False, 

1558 ) -> List[ResourcePath]: 

1559 """Retrieve the artifacts associated with the supplied refs. 

1560 

1561 Parameters 

1562 ---------- 

1563 refs : iterable of `DatasetRef` 

1564 The datasets for which artifacts are to be retrieved. 

1565 A single ref can result in multiple artifacts. The refs must 

1566 be resolved. 

1567 destination : `lsst.resources.ResourcePath` or `str` 

1568 Location to write the artifacts. 

1569 transfer : `str`, optional 

1570 Method to use to transfer the artifacts. Must be one of the options 

1571 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1572 "move" is not allowed. 

1573 preserve_path : `bool`, optional 

1574 If `True` the full path of the artifact within the datastore 

1575 is preserved. If `False` the final file component of the path 

1576 is used. 

1577 overwrite : `bool`, optional 

1578 If `True` allow transfers to overwrite existing files at the 

1579 destination. 

1580 

1581 Returns 

1582 ------- 

1583 targets : `list` of `lsst.resources.ResourcePath` 

1584 URIs of file artifacts in destination location. Order is not 

1585 preserved. 

1586 

1587 Notes 

1588 ----- 

1589 For non-file datastores the artifacts written to the destination 

1590 may not match the representation inside the datastore. For example 

1591 a hierarchical data structure in a NoSQL database may well be stored 

1592 as a JSON file. 
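
Examples
--------
A minimal sketch that copies the artifacts of every ``bias`` dataset in
an illustrative collection to a local directory::

    refs = butler.registry.queryDatasets("bias", collections="calib/example")
    paths = butler.retrieveArtifacts(
        refs,
        destination="/tmp/bias-artifacts",
        transfer="copy",
        preserve_path=False,
    )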

1593 """ 

1594 return self.datastore.retrieveArtifacts( 

1595 refs, 

1596 ResourcePath(destination), 

1597 transfer=transfer, 

1598 preserve_path=preserve_path, 

1599 overwrite=overwrite, 

1600 ) 

1601 

1602 def exists( 

1603 self, 

1604 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1605 /, 

1606 data_id: DataId | None = None, 

1607 *, 

1608 full_check: bool = True, 

1609 collections: Any = None, 

1610 **kwargs: Any, 

1611 ) -> DatasetExistence: 

1612 """Indicate whether a dataset is known to Butler registry and 

1613 datastore. 

1614 

1615 Parameters 

1616 ---------- 

1617 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1618 When `DatasetRef` the `dataId` should be `None`. 

1619 Otherwise the `DatasetType` or name thereof. 

1620 data_id : `dict` or `DataCoordinate` 

1621 A `dict` of `Dimension` link name, value pairs that label the 

1622 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1623 should be provided as the first argument. 

1624 full_check : `bool`, optional 

1625 If `True`, an additional check will be made for dataset artifact 

1626 existence. This will involve additional overhead due to the need 

1627 to query an external system. If `False` registry and datastore 

1628 will solely be asked if they know about the dataset but no 

1629 check for the artifact will be performed. 

1630 collections : Any, optional 

1631 Collections to be searched, overriding ``self.collections``. 

1632 Can be any of the types supported by the ``collections`` argument 

1633 to butler construction. 

1634 **kwargs 

1635 Additional keyword arguments used to augment or construct a 

1636 `DataCoordinate`. See `DataCoordinate.standardize` 

1637 parameters. 

1638 

1639 Returns 

1640 ------- 

1641 existence : `DatasetExistence` 

1642 Object indicating whether the dataset is known to registry and 

1643 datastore. Evaluates to `True` if the dataset is present and known 

1644 to both. 
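
Examples
--------
A minimal sketch with illustrative data ID values and collection name;
as described above, the returned `DatasetExistence` evaluates to `True`
only if the dataset is present and known to both registry and
datastore::

    existence = butler.exists(
        "bias", instrument="ExampleCam", detector=42, collections="calib/example"
    )
    if not existence:
        print("bias is missing or incomplete")

    # Cheaper variant that skips the artifact existence check.
    existence = butler.exists(
        "bias",
        instrument="ExampleCam",
        detector=42,
        full_check=False,
        collections="calib/example",
    )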

1645 """ 

1646 existence = DatasetExistence.UNRECOGNIZED 

1647 

1648 if isinstance(dataset_ref_or_type, DatasetRef): 

1649 if collections is not None: 

1650 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1651 if data_id is not None: 

1652 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1653 ref = dataset_ref_or_type 

1654 registry_ref = self.registry.getDataset(dataset_ref_or_type.id) 

1655 if registry_ref is not None: 

1656 existence |= DatasetExistence.RECORDED 

1657 

1658 if dataset_ref_or_type != registry_ref: 

1659 # This could mean that storage classes differ, so we should 

1660 # check for that but use the registry ref for the rest of 

1661 # the method. 

1662 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1663 # Use the registry version from now on. 

1664 ref = registry_ref 

1665 else: 

1666 raise ValueError( 

1667 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1668 f"in registry but has different incompatible values ({registry_ref})." 

1669 ) 

1670 else: 

1671 try: 

1672 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1673 except (LookupError, TypeError, NoDefaultCollectionError): 

1674 return existence 

1675 existence |= DatasetExistence.RECORDED 

1676 

1677 if self.datastore.knows(ref): 

1678 existence |= DatasetExistence.DATASTORE 

1679 

1680 if full_check: 

1681 if self.datastore.exists(ref): 

1682 existence |= DatasetExistence._ARTIFACT 

1683 elif existence != DatasetExistence.UNRECOGNIZED: 

1684 # Do not add this flag if we have no other idea about a dataset. 

1685 existence |= DatasetExistence._ASSUMED 

1686 

1687 return existence 

1688 

1689 def _exists_many( 

1690 self, 

1691 refs: Iterable[DatasetRef], 

1692 /, 

1693 *, 

1694 full_check: bool = True, 

1695 ) -> dict[DatasetRef, DatasetExistence]: 

1696 """Indicate whether multiple datasets are known to Butler registry and 

1697 datastore. 

1698 

1699 This is an experimental API that may change at any moment. 

1700 

1701 Parameters 

1702 ---------- 

1703 refs : iterable of `DatasetRef` 

1704 The datasets to be checked. 

1705 full_check : `bool`, optional 

1706 If `True`, an additional check will be made for dataset artifact 

1707 existence. This will involve additional overhead due to the need 

1708 to query an external system. If `False` registry and datastore 

1709 will solely be asked if they know about the dataset but no 

1710 check for the artifact will be performed. 

1711 

1712 Returns 

1713 ------- 

1714 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1715 Mapping from the given dataset refs to an enum indicating the 

1716 status of the dataset in registry and datastore. 

1717 Each value evaluates to `True` if the dataset is present and known 

1718 to both. 

1719 """ 

1720 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1721 

1722 # Registry does not have a bulk API to check for a ref. 

1723 for ref in refs: 

1724 registry_ref = self.registry.getDataset(ref.id) 

1725 if registry_ref is not None: 

1726 # It is possible, albeit unlikely, that the given ref does 

1727 # not match the one in registry even though the UUID matches. 

1728 # When checking a single ref we raise, but it's impolite to 

1729 # do that when potentially hundreds of refs are being checked. 

1730 # We could change the API to only accept UUIDs and that would 

1731 # remove the ability to even check and remove the worry 

1732 # about differing storage classes. Given the ongoing discussion 

1733 # on refs vs UUIDs and whether to raise or have a new 

1734 # private flag, treat this as a private API for now. 

1735 existence[ref] |= DatasetExistence.RECORDED 

1736 

1737 # Ask datastore if it knows about these refs. 

1738 knows = self.datastore.knows_these(refs) 

1739 for ref, known in knows.items(): 

1740 if known: 

1741 existence[ref] |= DatasetExistence.DATASTORE 

1742 

1743 if full_check: 

1744 mexists = self.datastore.mexists(refs) 

1745 for ref, exists in mexists.items(): 

1746 if exists: 

1747 existence[ref] |= DatasetExistence._ARTIFACT 

1748 else: 

1749 # Do not set this flag if nothing is known about the dataset. 

1750 for ref in existence.keys(): 

1751 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1752 existence[ref] |= DatasetExistence._ASSUMED 

1753 

1754 return existence 

1755 

1756 @deprecated( 

1757 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v27.0.", 

1758 version="v26.0", 

1759 category=FutureWarning, 

1760 ) 

1761 def datasetExists( 

1762 self, 

1763 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1764 dataId: Optional[DataId] = None, 

1765 *, 

1766 collections: Any = None, 

1767 **kwargs: Any, 

1768 ) -> bool: 

1769 """Return True if the Dataset is actually present in the Datastore. 

1770 

1771 Parameters 

1772 ---------- 

1773 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1774 When `DatasetRef` the `dataId` should be `None`. 

1775 Otherwise the `DatasetType` or name thereof. 

1776 dataId : `dict` or `DataCoordinate` 

1777 A `dict` of `Dimension` link name, value pairs that label the 

1778 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1779 should be provided as the first argument. 

1780 collections : Any, optional 

1781 Collections to be searched, overriding ``self.collections``. 

1782 Can be any of the types supported by the ``collections`` argument 

1783 to butler construction. 

1784 **kwargs 

1785 Additional keyword arguments used to augment or construct a 

1786 `DataCoordinate`. See `DataCoordinate.standardize` 

1787 parameters. 

1788 

1789 Raises 

1790 ------ 

1791 LookupError 

1792 Raised if the dataset is not even present in the Registry. 

1793 ValueError 

1794 Raised if a resolved `DatasetRef` was passed as an input, but it 

1795 differs from the one found in the registry. 

1796 NoDefaultCollectionError 

1797 Raised if no collections were provided. 

1798 """ 

1799 # A resolved ref may be given that is not known to this butler. 

1800 if isinstance(datasetRefOrType, DatasetRef): 

1801 ref = self.registry.getDataset(datasetRefOrType.id) 

1802 if ref is None: 

1803 raise LookupError( 

1804 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1805 ) 

1806 else: 

1807 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1808 return self.datastore.exists(ref) 

1809 

1810 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1811 """Remove one or more `~CollectionType.RUN` collections and the 

1812 datasets within them. 

1813 

1814 Parameters 

1815 ---------- 

1816 names : `Iterable` [ `str` ] 

1817 The names of the collections to remove. 

1818 unstore : `bool`, optional 

1819 If `True` (default), delete datasets from all datastores in which 

1820 they are present, and attempt to roll back the registry deletions if

1821 datastore deletions fail (which may not always be possible). If 

1822 `False`, datastore records for these datasets are still removed, 

1823 but any artifacts (e.g. files) will not be. 

1824 

1825 Raises 

1826 ------ 

1827 TypeError 

1828 Raised if one or more collections are not of type 

1829 `~CollectionType.RUN`. 
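
Examples
--------
A minimal sketch; the run names are illustrative::

    # Remove two RUN collections and delete their stored artifacts.
    butler.removeRuns(["u/example/run1", "u/example/run2"], unstore=True)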

1830 """ 

1831 if not self.isWriteable(): 

1832 raise TypeError("Butler is read-only.") 

1833 names = list(names) 

1834 refs: List[DatasetRef] = [] 

1835 for name in names: 

1836 collectionType = self.registry.getCollectionType(name) 

1837 if collectionType is not CollectionType.RUN: 

1838 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1839 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1840 with self.datastore.transaction(): 

1841 with self.registry.transaction(): 

1842 if unstore: 

1843 self.datastore.trash(refs) 

1844 else: 

1845 self.datastore.forget(refs) 

1846 for name in names: 

1847 self.registry.removeCollection(name) 

1848 if unstore: 

1849 # Point of no return for removing artifacts 

1850 self.datastore.emptyTrash() 

1851 

1852 def pruneDatasets( 

1853 self, 

1854 refs: Iterable[DatasetRef], 

1855 *, 

1856 disassociate: bool = True, 

1857 unstore: bool = False, 

1858 tags: Iterable[str] = (), 

1859 purge: bool = False, 

1860 ) -> None: 

1861 # docstring inherited from LimitedButler 

1862 

1863 if not self.isWriteable(): 

1864 raise TypeError("Butler is read-only.") 

1865 if purge: 

1866 if not disassociate: 

1867 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1868 if not unstore: 

1869 raise TypeError("Cannot pass purge=True without unstore=True.") 

1870 elif disassociate: 

1871 tags = tuple(tags) 

1872 if not tags: 

1873 raise TypeError("No tags provided but disassociate=True.") 

1874 for tag in tags: 

1875 collectionType = self.registry.getCollectionType(tag) 

1876 if collectionType is not CollectionType.TAGGED: 

1877 raise TypeError( 

1878 f"Cannot disassociate from collection '{tag}' " 

1879 f"of non-TAGGED type {collectionType.name}." 

1880 ) 

1881 # Transform possibly-single-pass iterable into something we can iterate 

1882 # over multiple times. 

1883 refs = list(refs) 

1884 # Pruning a component of a DatasetRef makes no sense since registry 

1885 # doesn't know about components and datastore might not store 

1886 # components in a separate file 

1887 for ref in refs: 

1888 if ref.datasetType.component(): 

1889 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})")

1890 # We don't need an unreliable Datastore transaction for this, because 

1891 # we've been extra careful to ensure that Datastore.trash only involves 

1892 # mutating the Registry (it can _look_ at Datastore-specific things, 

1893 # but shouldn't change them), and hence all operations here are 

1894 # Registry operations. 

1895 with self.datastore.transaction(): 

1896 with self.registry.transaction(): 

1897 if unstore: 

1898 self.datastore.trash(refs) 

1899 if purge: 

1900 self.registry.removeDatasets(refs) 

1901 elif disassociate: 

1902 assert tags, "Guaranteed by earlier logic in this function." 

1903 for tag in tags: 

1904 self.registry.disassociate(tag, refs) 

1905 # We've exited the Registry transaction, and apparently committed. 

1906 # (if there was an exception, everything rolled back, and it's as if 

1907 # nothing happened - and we never get here). 

1908 # Datastore artifacts are not yet gone, but they're clearly marked 

1909 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1910 # problems we can try again later, and if manual administrative 

1911 # intervention is required, it's pretty clear what that should entail: 

1912 # deleting everything on disk and in private Datastore tables that is 

1913 # in the dataset_location_trash table. 

1914 if unstore: 

1915 # Point of no return for removing artifacts 

1916 self.datastore.emptyTrash() 

1917 

1918 @transactional 

1919 def ingest( 

1920 self, 

1921 *datasets: FileDataset, 

1922 transfer: Optional[str] = "auto", 

1923 run: Optional[str] = None, 

1924 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1925 record_validation_info: bool = True, 

1926 ) -> None: 

1927 """Store and register one or more datasets that already exist on disk. 

1928 

1929 Parameters 

1930 ---------- 

1931 datasets : `FileDataset` 

1932 Each positional argument is a struct containing information about 

1933 a file to be ingested, including its URI (either absolute or 

1934 relative to the datastore root, if applicable), a resolved 

1935 `DatasetRef`, and optionally a formatter class or its 

1936 fully-qualified string name. If a formatter is not provided, the 

1937 formatter that would be used for `put` is assumed. On successful 

1938 ingest all `FileDataset.formatter` attributes will be set to the 

1939 formatter class used. `FileDataset.path` attributes may be modified 

1940 to put paths in whatever the datastore considers a standardized 

1941 form. 

1942 transfer : `str`, optional 

1943 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1944 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1945 transfer the file. 

1946 run : `str`, optional 

1947 The name of the run ingested datasets should be added to, 

1948 overriding ``self.run``. This parameter is now deprecated since 

1949 the run is encoded in the ``FileDataset``. 

1950 idGenerationMode : `DatasetIdGenEnum`, optional 

1951 Specifies option for generating dataset IDs. By default unique IDs 

1952 are generated for each inserted dataset. 

1953 record_validation_info : `bool`, optional 

1954 If `True`, the default, the datastore can record validation 

1955 information associated with the file. If `False` the datastore 

1956 will not attempt to track any information such as checksums 

1957 or file sizes. This can be useful if such information is tracked 

1958 in an external system or if the file is to be compressed in place. 

1959 It is up to the datastore whether this parameter is relevant. 

1960 

1961 Raises 

1962 ------ 

1963 TypeError 

1964 Raised if the butler is read-only or if no run was provided. 

1965 NotImplementedError 

1966 Raised if the `Datastore` does not support the given transfer mode. 

1967 DatasetTypeNotSupportedError 

1968 Raised if one or more files to be ingested have a dataset type that 

1969 is not supported by the `Datastore`.

1970 FileNotFoundError 

1971 Raised if one of the given files does not exist. 

1972 FileExistsError 

1973 Raised if transfer is not `None` but the (internal) location the 

1974 file would be moved to is already occupied. 

1975 

1976 Notes 

1977 ----- 

1978 This operation is not fully exception safe: if a database operation 

1979 fails, the given `FileDataset` instances may be only partially updated. 

1980 

1981 It is atomic in terms of database operations (they will either all 

1982 succeed or all fail) providing the database engine implements 

1983 transactions correctly. It will attempt to be atomic in terms of 

1984 filesystem operations as well, but this cannot be implemented 

1985 rigorously for most datastores. 
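
Examples
--------
A minimal sketch, assuming a ``bias`` dataset type with dimensions
``{instrument, detector}`` is already registered; the data ID values,
file URI, and run name are illustrative. `DatasetRef` and `FileDataset`
are importable from `lsst.daf.butler`::

    bias_type = butler.registry.getDatasetType("bias")
    # Make sure the output RUN collection exists before ingesting.
    butler.registry.registerRun("calib/example/ingest")
    ref = DatasetRef(
        bias_type,
        {"instrument": "ExampleCam", "detector": 42},
        run="calib/example/ingest",
    )
    butler.ingest(
        FileDataset(path="file:///data/biases/bias_42.fits", refs=[ref]),
        transfer="copy",
    )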

1986 """ 

1987 if not self.isWriteable(): 

1988 raise TypeError("Butler is read-only.") 

1989 

1990 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1991 if not datasets: 

1992 return 

1993 

1994 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1995 

1996 # We need to reorganize all the inputs so that they are grouped 

1997 # by dataset type and run. Multiple refs in a single FileDataset 

1998 # are required to share the run and dataset type. 

1999 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

2000 groupedData: GroupedData = defaultdict(list) 

2001 

2002 # Track DataIDs that are being ingested so we can spot issues early 

2003 # with duplication. Retain previous FileDataset so we can report it. 

2004 groupedDataIds: MutableMapping[ 

2005 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

2006 ] = defaultdict(dict) 

2007 

2008 used_run = False 

2009 

2010 # And the nested loop that populates it: 

2011 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

2012 # Somewhere to store pre-existing refs if we have an 

2013 # execution butler. 

2014 existingRefs: List[DatasetRef] = [] 

2015 

2016 for ref in dataset.refs: 

2017 assert ref.run is not None # For mypy 

2018 group_key = (ref.datasetType, ref.run) 

2019 

2020 if ref.dataId in groupedDataIds[group_key]: 

2021 raise ConflictingDefinitionError( 

2022 f"Ingest conflict. Dataset {dataset.path} has same" 

2023 " DataId as other ingest dataset" 

2024 f" {groupedDataIds[group_key][ref.dataId].path} " 

2025 f" ({ref.dataId})" 

2026 ) 

2027 

2028 groupedDataIds[group_key][ref.dataId] = dataset 

2029 

2030 if existingRefs: 

2031 if len(dataset.refs) != len(existingRefs): 

2032 # Keeping track of partially pre-existing datasets is hard 

2033 # and should generally never happen. For now don't allow 

2034 # it. 

2035 raise ConflictingDefinitionError( 

2036 f"For dataset {dataset.path} some dataIds already exist" 

2037 " in registry but others do not. This is not supported." 

2038 ) 

2039 

2040 # Store expanded form in the original FileDataset. 

2041 dataset.refs = existingRefs 

2042 else: 

2043 groupedData[group_key].append(dataset) 

2044 

2045 if not used_run and run is not None: 

2046 warnings.warn( 

2047 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2048 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2049 category=FutureWarning, 

2050 stacklevel=3, # Take into account the @transactional decorator. 

2051 ) 

2052 

2053 # Now we can bulk-insert into Registry for each DatasetType. 

2054 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2055 groupedData.items(), desc="Bulk-inserting datasets by type" 

2056 ): 

2057 refs_to_import = [] 

2058 for dataset in grouped_datasets: 

2059 refs_to_import.extend(dataset.refs) 

2060 

2061 n_refs = len(refs_to_import) 

2062 log.verbose( 

2063 "Importing %d ref%s of dataset type %r into run %r", 

2064 n_refs, 

2065 "" if n_refs == 1 else "s", 

2066 datasetType.name, 

2067 this_run, 

2068 ) 

2069 

2070 # Import the refs and expand the DataCoordinates since we can't 

2071 # guarantee that they are expanded and Datastore will need 

2072 # the records. 

2073 imported_refs = self.registry._importDatasets(refs_to_import, expand=True) 

2074 assert set(imported_refs) == set(refs_to_import) 

2075 

2076 # Replace all the refs in the FileDataset with expanded versions. 

2077 # Pull them off in the order we put them on the list. 

2078 for dataset in grouped_datasets: 

2079 n_dataset_refs = len(dataset.refs) 

2080 dataset.refs = imported_refs[:n_dataset_refs] 

2081 del imported_refs[:n_dataset_refs] 

2082 

2083 # Bulk-insert everything into Datastore. 

2084 # We do not know if any of the registry entries already existed 

2085 # (_importDatasets only complains if they exist but differ) so 

2086 # we have to catch IntegrityError explicitly. 

2087 try: 

2088 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

2089 except IntegrityError as e: 

2090 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") 

2091 

2092 @contextlib.contextmanager 

2093 def export( 

2094 self, 

2095 *, 

2096 directory: Optional[str] = None, 

2097 filename: Optional[str] = None, 

2098 format: Optional[str] = None, 

2099 transfer: Optional[str] = None, 

2100 ) -> Iterator[RepoExportContext]: 

2101 """Export datasets from the repository represented by this `Butler`. 

2102 

2103 This method is a context manager that returns a helper object 

2104 (`RepoExportContext`) that is used to indicate what information from 

2105 the repository should be exported. 

2106 

2107 Parameters 

2108 ---------- 

2109 directory : `str`, optional 

2110 Directory dataset files should be written to if ``transfer`` is not 

2111 `None`. 

2112 filename : `str`, optional 

2113 Name for the file that will include database information associated 

2114 with the exported datasets. If this is not an absolute path and 

2115 ``directory`` is not `None`, it will be written to ``directory`` 

2116 instead of the current working directory. Defaults to 

2117 "export.{format}". 

2118 format : `str`, optional 

2119 File format for the database information file. If `None`, the 

2120 extension of ``filename`` will be used. 

2121 transfer : `str`, optional 

2122 Transfer mode passed to `Datastore.export`. 

2123 

2124 Raises 

2125 ------ 

2126 TypeError 

2127 Raised if the set of arguments passed is inconsistent. 

2128 

2129 Examples 

2130 -------- 

2131 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2132 methods are used to provide the iterables over data IDs and/or datasets 

2133 to be exported:: 

2134 

2135 with butler.export(filename="exports.yaml") as export:

2136 # Export all flats, but none of the dimension element rows 

2137 # (i.e. data ID information) associated with them. 

2138 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2139 elements=()) 

2140 # Export all datasets that start with "deepCoadd_" and all of 

2141 # their associated data ID information. 

2142 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2143 """ 

2144 if directory is None and transfer is not None: 

2145 raise TypeError("Cannot transfer without providing a directory.") 

2146 if transfer == "move": 

2147 raise TypeError("Transfer may not be 'move': export is read-only") 

2148 if format is None: 

2149 if filename is None: 

2150 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2151 else: 

2152 _, format = os.path.splitext(filename) 

2153 if not format: 

2154 raise ValueError("Please specify a file extension to determine export format.") 

2155 format = format[1:]  # Strip leading "."

2156 elif filename is None: 

2157 filename = f"export.{format}" 

2158 if directory is not None: 

2159 filename = os.path.join(directory, filename) 

2160 formats = self._config["repo_transfer_formats"] 

2161 if format not in formats: 

2162 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2163 BackendClass = get_class_of(formats[format, "export"]) 

2164 with open(filename, "w") as stream: 

2165 backend = BackendClass(stream, universe=self.registry.dimensions) 

2166 try: 

2167 helper = RepoExportContext( 

2168 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2169 ) 

2170 yield helper 

2171 except BaseException: 

2172 raise 

2173 else: 

2174 helper._finish() 

2175 

2176 def import_( 

2177 self, 

2178 *, 

2179 directory: Optional[ResourcePathExpression] = None, 

2180 filename: Union[ResourcePathExpression, TextIO, None] = None, 

2181 format: Optional[str] = None, 

2182 transfer: Optional[str] = None, 

2183 skip_dimensions: Optional[Set] = None, 

2184 ) -> None: 

2185 """Import datasets into this repository that were exported from a 

2186 different butler repository via `~lsst.daf.butler.Butler.export`. 

2187 

2188 Parameters 

2189 ---------- 

2190 directory : `~lsst.resources.ResourcePathExpression`, optional 

2191 Directory containing dataset files to import from. If `None`, 

2192 ``filename`` and all dataset file paths specified therein must 

2193 be absolute. 

2194 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2195 A stream or name of file that contains database information 

2196 associated with the exported datasets, typically generated by 

2197 `~lsst.daf.butler.Butler.export`. If this is a string (name) or

2198 `~lsst.resources.ResourcePath` and is not an absolute path, 

2199 it will first be looked for relative to ``directory`` and if not 

2200 found there it will be looked for in the current working 

2201 directory. Defaults to "export.{format}". 

2202 format : `str`, optional 

2203 File format for ``filename``. If `None`, the extension of 

2204 ``filename`` will be used. 

2205 transfer : `str`, optional 

2206 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2207 skip_dimensions : `set`, optional 

2208 Names of dimensions that should be skipped and not imported. 

2209 

2210 Raises 

2211 ------ 

2212 TypeError 

2213 Raised if the set of arguments passed is inconsistent, or if the 

2214 butler is read-only. 
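
Examples
--------
A minimal sketch, assuming an export previously produced by
`~lsst.daf.butler.Butler.export`; the directory path is illustrative::

    butler.import_(
        directory="/path/to/exported/data",
        filename="export.yaml",
        transfer="direct",
    )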

2215 """ 

2216 if not self.isWriteable(): 

2217 raise TypeError("Butler is read-only.") 

2218 if format is None: 

2219 if filename is None: 

2220 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2221 else: 

2222 _, format = os.path.splitext(filename) # type: ignore 

2223 elif filename is None: 

2224 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2225 if directory is not None: 

2226 directory = ResourcePath(directory, forceDirectory=True) 

2227 # mypy doesn't think this will work but it does in python >= 3.10. 

2228 if isinstance(filename, ResourcePathExpression): # type: ignore 

2229 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2230 if not filename.isabs() and directory is not None: 

2231 potential = directory.join(filename) 

2232 exists_in_cwd = filename.exists() 

2233 exists_in_dir = potential.exists() 

2234 if exists_in_cwd and exists_in_dir: 

2235 log.warning( 

2236 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2237 "Additionally, the file exists relative to the given search directory (%s). " 

2238 "Using the export file in the given directory.", 

2239 filename, 

2240 potential, 

2241 ) 

2242 # Given they specified an explicit directory and that 

2243 # directory has the export file in it, assume that that 

2244 # is what was meant despite the file in cwd. 

2245 filename = potential 

2246 elif exists_in_dir: 

2247 filename = potential 

2248 elif not exists_in_cwd and not exists_in_dir: 

2249 # Raise early. 

2250 raise FileNotFoundError( 

2251 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2252 ) 

2253 BackendClass: type[RepoImportBackend] = get_class_of( 

2254 self._config["repo_transfer_formats"][format]["import"] 

2255 ) 

2256 

2257 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2258 backend = BackendClass(importStream, self.registry) # type: ignore[call-arg] 

2259 backend.register() 

2260 with self.transaction(): 

2261 backend.load( 

2262 self.datastore, 

2263 directory=directory, 

2264 transfer=transfer, 

2265 skip_dimensions=skip_dimensions, 

2266 ) 

2267 

2268 if isinstance(filename, ResourcePath): 

2269 # We can not use open() here at the moment because of 

2270 # DM-38589 since yaml does stream.read(8192) in a loop. 

2271 stream = io.StringIO(filename.read().decode()) 

2272 doImport(stream) 

2273 else: 

2274 doImport(filename) # type: ignore 

2275 

2276 def transfer_from( 

2277 self, 

2278 source_butler: LimitedButler, 

2279 source_refs: Iterable[DatasetRef], 

2280 transfer: str = "auto", 

2281 skip_missing: bool = True, 

2282 register_dataset_types: bool = False, 

2283 transfer_dimensions: bool = False, 

2284 ) -> collections.abc.Collection[DatasetRef]: 

2285 """Transfer datasets to this Butler from a run in another Butler. 

2286 

2287 Parameters 

2288 ---------- 

2289 source_butler : `LimitedButler` 

2290 Butler from which the datasets are to be transferred. If data IDs 

2291 in ``source_refs`` are not expanded then this has to be a full 

2292 `Butler` whose registry will be used to expand data IDs. 

2293 source_refs : iterable of `DatasetRef` 

2294 Datasets defined in the source butler that should be transferred to 

2295 this butler. 

2296 transfer : `str`, optional 

2297 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2298 skip_missing : `bool` 

2299 If `True`, datasets with no datastore artifact associated with 

2300 them are not transferred. If `False` a registry entry will be 

2301 created even if no datastore record is created (and so will 

2302 look equivalent to the dataset being unstored). 

2303 register_dataset_types : `bool` 

2304 If `True` any missing dataset types are registered. Otherwise 

2305 an exception is raised. 

2306 transfer_dimensions : `bool`, optional 

2307 If `True`, dimension record data associated with the new datasets 

2308 will be transferred. 

2309 

2310 Returns 

2311 ------- 

2312 refs : `list` of `DatasetRef` 

2313 The refs added to this Butler. 

2314 

2315 Notes 

2316 ----- 

2317 The datastore artifact has to exist for a transfer 

2318 to be made but non-existence is not an error. 

2319 

2320 Datasets that already exist in this run will be skipped. 

2321 

2322 The datasets are imported as part of a transaction, although 

2323 dataset types are registered before the transaction is started. 

2324 This means that it is possible for a dataset type to be registered 

2325 even though transfer has failed. 
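
Examples
--------
A minimal sketch; the source repository path, dataset type, and
collection name are illustrative::

    source = Butler("/path/to/source/repo")
    refs = source.registry.queryDatasets("bias", collections="calib/example")
    transferred = butler.transfer_from(
        source,
        refs,
        transfer="copy",
        register_dataset_types=True,
        transfer_dimensions=True,
    )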

2326 """ 

2327 if not self.isWriteable(): 

2328 raise TypeError("Butler is read-only.") 

2329 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2330 

2331 # Will iterate through the refs multiple times so need to convert 

2332 # to a list if this isn't a collection. 

2333 if not isinstance(source_refs, collections.abc.Collection): 

2334 source_refs = list(source_refs) 

2335 

2336 original_count = len(source_refs) 

2337 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2338 

2339 # In some situations the datastore artifact may be missing 

2340 # and we do not want that registry entry to be imported. 

2341 # Asking datastore is not sufficient, the records may have been 

2342 # purged, we have to ask for the (predicted) URI and check 

2343 # existence explicitly. Execution butler is set up exactly like 

2344 # this with no datastore records. 

2345 artifact_existence: Dict[ResourcePath, bool] = {} 

2346 if skip_missing: 

2347 dataset_existence = source_butler.datastore.mexists( 

2348 source_refs, artifact_existence=artifact_existence 

2349 ) 

2350 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2351 filtered_count = len(source_refs) 

2352 n_missing = original_count - filtered_count 

2353 log.verbose( 

2354 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2355 n_missing, 

2356 "" if n_missing == 1 else "s", 

2357 filtered_count, 

2358 ) 

2359 

2360 # Importing requires that we group the refs by dataset type and run 

2361 # before doing the import. 

2362 source_dataset_types = set() 

2363 grouped_refs = defaultdict(list) 

2364 for ref in source_refs: 

2365 grouped_refs[ref.datasetType, ref.run].append(ref) 

2366 source_dataset_types.add(ref.datasetType) 

2367 

2368 # Check to see if the dataset type in the source butler has 

2369 # the same definition in the target butler and register missing 

2370 # ones if requested. Registration must happen outside a transaction. 

2371 newly_registered_dataset_types = set() 

2372 for datasetType in source_dataset_types: 

2373 if register_dataset_types: 

2374 # Let this raise immediately if inconsistent. Continuing 

2375 # on to find additional inconsistent dataset types 

2376 # might result in additional unwanted dataset types being 

2377 # registered. 

2378 if self.registry.registerDatasetType(datasetType): 

2379 newly_registered_dataset_types.add(datasetType) 

2380 else: 

2381 # If the dataset type is missing, let it fail immediately. 

2382 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2383 if target_dataset_type != datasetType: 

2384 raise ConflictingDefinitionError( 

2385 "Source butler dataset type differs from definition" 

2386 f" in target butler: {datasetType} !=" 

2387 f" {target_dataset_type}" 

2388 ) 

2389 if newly_registered_dataset_types: 

2390 # We may have registered some even if there were inconsistencies 

2391 # but should let people know (or else remove them again). 

2392 log.log( 

2393 VERBOSE, 

2394 "Registered the following dataset types in the target Butler: %s", 

2395 ", ".join(d.name for d in newly_registered_dataset_types), 

2396 ) 

2397 else: 

2398 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2399 

2400 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2401 if transfer_dimensions: 

2402 # Collect all the dimension records for these refs. 

2403 # All dimensions are to be copied but the list of valid dimensions 

2404 # comes from this butler's universe.

2405 elements = frozenset( 

2406 element 

2407 for element in self.registry.dimensions.getStaticElements() 

2408 if element.hasTable() and element.viewOf is None 

2409 ) 

2410 dataIds = set(ref.dataId for ref in source_refs) 

2411 # This logic comes from saveDataIds. 

2412 for dataId in dataIds: 

2413 # Need an expanded record; if not expanded then we need a full

2414 # butler with registry (allow mocks with registry too). 

2415 if not dataId.hasRecords(): 

2416 if registry := getattr(source_butler, "registry", None): 

2417 dataId = registry.expandDataId(dataId) 

2418 else: 

2419 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2420 # If this butler doesn't know about a dimension in the source 

2421 # butler, things will break later.

2422 for record in dataId.records.values(): 

2423 if record is not None and record.definition in elements: 

2424 dimension_records[record.definition].setdefault(record.dataId, record) 

2425 

2426 handled_collections: Set[str] = set() 

2427 

2428 # Do all the importing in a single transaction. 

2429 with self.transaction(): 

2430 if dimension_records: 

2431 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2432 for element, r in dimension_records.items(): 

2433 records = [r[dataId] for dataId in r] 

2434 # Assume that if the record is already present that we can 

2435 # use it without having to check that the record metadata 

2436 # is consistent. 

2437 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2438 

2439 n_imported = 0 

2440 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2441 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2442 ): 

2443 if run not in handled_collections: 

2444 # May need to create output collection. If source butler 

2445 # has a registry, ask for documentation string. 

2446 run_doc = None 

2447 if registry := getattr(source_butler, "registry", None): 

2448 run_doc = registry.getCollectionDocumentation(run) 

2449 registered = self.registry.registerRun(run, doc=run_doc) 

2450 handled_collections.add(run) 

2451 if registered: 

2452 log.log(VERBOSE, "Creating output run %s", run) 

2453 

2454 n_refs = len(refs_to_import) 

2455 log.verbose( 

2456 "Importing %d ref%s of dataset type %s into run %s", 

2457 n_refs, 

2458 "" if n_refs == 1 else "s", 

2459 datasetType.name, 

2460 run, 

2461 ) 

2462 

2463 # Assume we are using UUIDs and the source refs will match 

2464 # those imported. 

2465 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2466 assert set(imported_refs) == set(refs_to_import) 

2467 n_imported += len(imported_refs) 

2468 

2469 assert len(source_refs) == n_imported 

2470 log.verbose("Imported %d datasets into destination butler", n_imported) 

2471 

2472 # Ask the datastore to transfer. The datastore has to check that 

2473 # the source datastore is compatible with the target datastore. 

2474 accepted, rejected = self.datastore.transfer_from( 

2475 source_butler.datastore, 

2476 source_refs, 

2477 transfer=transfer, 

2478 artifact_existence=artifact_existence, 

2479 ) 

2480 if rejected: 

2481 # For now, accept the registry entries but not the files. 

2482 log.warning( 

2483 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2484 len(rejected), 

2485 len(accepted), 

2486 datasetType, 

2487 run, 

2488 ) 

2489 

2490 return source_refs 

2491 

2492 def validateConfiguration( 

2493 self, 

2494 logFailures: bool = False, 

2495 datasetTypeNames: Optional[Iterable[str]] = None, 

2496 ignore: Iterable[str] | None = None, 

2497 ) -> None: 

2498 """Validate butler configuration. 

2499 

2500 Checks that each `DatasetType` can be stored in the `Datastore`. 

2501 

2502 Parameters 

2503 ---------- 

2504 logFailures : `bool`, optional 

2505 If `True`, output a log message for every validation error 

2506 detected. 

2507 datasetTypeNames : iterable of `str`, optional 

2508 The `DatasetType` names that should be checked. This allows 

2509 only a subset to be selected. 

2510 ignore : iterable of `str`, optional 

2511 Names of DatasetTypes to skip over. This can be used to skip 

2512 known problems. If a named `DatasetType` corresponds to a 

2513 composite, all components of that `DatasetType` will also be 

2514 ignored. 

2515 

2516 Raises 

2517 ------ 

2518 ButlerValidationError 

2519 Raised if there is some inconsistency with how this Butler 

2520 is configured. 
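
Examples
--------
A minimal sketch; the ignored dataset type name is illustrative::

    # Log every problem found and skip a dataset type that is known
    # to be misconfigured.
    butler.validateConfiguration(logFailures=True, ignore=["unvalidated_type"])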

2521 """ 

2522 if datasetTypeNames: 

2523 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2524 else: 

2525 datasetTypes = list(self.registry.queryDatasetTypes()) 

2526 

2527 # filter out anything from the ignore list 

2528 if ignore: 

2529 ignore = set(ignore) 

2530 datasetTypes = [ 

2531 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2532 ] 

2533 else: 

2534 ignore = set() 

2535 

2536 # Find all the registered instruments 

2537 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2538 

2539 # For each datasetType that has an instrument dimension, create 

2540 # a DatasetRef for each defined instrument 

2541 datasetRefs = [] 

2542 

2543 for datasetType in datasetTypes: 

2544 if "instrument" in datasetType.dimensions: 

2545 for instrument in instruments: 

2546 datasetRef = DatasetRef( 

2547 datasetType, 

2548 {"instrument": instrument}, # type: ignore 

2549 conform=False, 

2550 run="validate", 

2551 ) 

2552 datasetRefs.append(datasetRef) 

2553 

2554 entities: List[Union[DatasetType, DatasetRef]] = [] 

2555 entities.extend(datasetTypes) 

2556 entities.extend(datasetRefs) 

2557 

2558 datastoreErrorStr = None 

2559 try: 

2560 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2561 except ValidationError as e: 

2562 datastoreErrorStr = str(e) 

2563 

2564 # Also check that the LookupKeys used by the datastores match 

2565 # registry and storage class definitions 

2566 keys = self.datastore.getLookupKeys() 

2567 

2568 failedNames = set() 

2569 failedDataId = set() 

2570 for key in keys: 

2571 if key.name is not None: 

2572 if key.name in ignore: 

2573 continue 

2574 

2575 # skip if specific datasetType names were requested and this 

2576 # name does not match 

2577 if datasetTypeNames and key.name not in datasetTypeNames: 

2578 continue 

2579 

2580 # See if it is a StorageClass or a DatasetType 

2581 if key.name in self.storageClasses: 

2582 pass 

2583 else: 

2584 try: 

2585 self.registry.getDatasetType(key.name) 

2586 except KeyError: 

2587 if logFailures: 

2588 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2589 failedNames.add(key) 

2590 else: 

2591 # Dimensions are checked for consistency when the Butler 

2592 # is created and rendezvoused with a universe. 

2593 pass 

2594 

2595 # Check that the instrument is a valid instrument 

2596 # Currently only support instrument so check for that 

2597 if key.dataId: 

2598 dataIdKeys = set(key.dataId) 

2599 if set(["instrument"]) != dataIdKeys: 

2600 if logFailures: 

2601 log.critical("Key '%s' has unsupported DataId override", key) 

2602 failedDataId.add(key) 

2603 elif key.dataId["instrument"] not in instruments: 

2604 if logFailures: 

2605 log.critical("Key '%s' has unknown instrument", key) 

2606 failedDataId.add(key) 

2607 

2608 messages = [] 

2609 

2610 if datastoreErrorStr: 

2611 messages.append(datastoreErrorStr) 

2612 

2613 for failed, msg in ( 

2614 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2615 (failedDataId, "Keys with bad DataId entries: "), 

2616 ): 

2617 if failed: 

2618 msg += ", ".join(str(k) for k in failed) 

2619 messages.append(msg) 

2620 

2621 if messages: 

2622 raise ValidationError(";\n".join(messages)) 

2623 

2624 @property 

2625 def collections(self) -> Sequence[str]: 

2626 """The collections to search by default, in order 

2627 (`Sequence` [ `str` ]). 

2628 

2629 This is an alias for ``self.registry.defaults.collections``. It cannot 

2630 be set directly in isolation, but all defaults may be changed together 

2631 by assigning a new `RegistryDefaults` instance to 

2632 ``self.registry.defaults``. 

2633 """ 

2634 return self.registry.defaults.collections 

2635 

2636 @property 

2637 def run(self) -> Optional[str]: 

2638 """Name of the run this butler writes outputs to by default (`str` or 

2639 `None`). 

2640 

2641 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2642 directly in isolation, but all defaults may be changed together by 

2643 assigning a new `RegistryDefaults` instance to 

2644 ``self.registry.defaults``. 

2645 """ 

2646 return self.registry.defaults.run 

2647 

2648 @property 

2649 def dimensions(self) -> DimensionUniverse: 

2650 # Docstring inherited. 

2651 return self.registry.dimensions 

2652 

2653 registry: Registry 

2654 """The object that manages dataset metadata and relationships (`Registry`). 

2655 

2656 Most operations that don't involve reading or writing butler datasets are 

2657 accessible only via `Registry` methods. 

2658 """ 

2659 

2660 datastore: Datastore 

2661 """The object that manages actual dataset storage (`Datastore`). 

2662 

2663 Direct user access to the datastore should rarely be necessary; the primary 

2664 exception is the case where a `Datastore` implementation provides extra 

2665 functionality beyond what the base class defines. 

2666 """ 

2667 

2668 storageClasses: StorageClassFactory 

2669 """An object that maps known storage class names to objects that fully 

2670 describe them (`StorageClassFactory`). 

2671 """