Coverage for python/lsst/daf/butler/_butler.py: 12%

733 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-10-12 09:44 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "Butler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import logging 

41import numbers 

42import os 

43import warnings 

44from collections import Counter, defaultdict 

45from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

46from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

47 

48from deprecated.sphinx import deprecated 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils import doImportType 

51from lsst.utils.introspection import get_class_of 

52from lsst.utils.logging import VERBOSE, getLogger 

53from sqlalchemy.exc import IntegrityError 

54 

55from ._butler_config import ButlerConfig 

56from ._butler_repo_index import ButlerRepoIndex 

57from ._config import Config, ConfigSubset 

58from ._dataset_existence import DatasetExistence 

59from ._dataset_ref import DatasetIdGenEnum, DatasetRef 

60from ._dataset_type import DatasetType 

61from ._deferredDatasetHandle import DeferredDatasetHandle 

62from ._exceptions import ValidationError 

63from ._file_dataset import FileDataset 

64from ._limited_butler import LimitedButler 

65from ._registry_shim import RegistryShim 

66from ._storage_class import StorageClass, StorageClassFactory 

67from ._timespan import Timespan 

68from .datastore import DatasetRefURIs, Datastore, NullDatastore 

69from .dimensions import ( 

70 DataCoordinate, 

71 DataId, 

72 DataIdValue, 

73 Dimension, 

74 DimensionConfig, 

75 DimensionElement, 

76 DimensionRecord, 

77 DimensionUniverse, 

78) 

79from .progress import Progress 

80from .registry import ( 

81 CollectionType, 

82 ConflictingDefinitionError, 

83 DataIdError, 

84 MissingDatasetTypeError, 

85 NoDefaultCollectionError, 

86 Registry, 

87 RegistryConfig, 

88 RegistryDefaults, 

89 _ButlerRegistry, 

90 _RegistryFactory, 

91) 

92from .repo_relocation import BUTLER_ROOT_TAG 

93from .transfers import RepoExportContext 

94from .utils import transactional 

95 

96if TYPE_CHECKING: 

97 from lsst.resources import ResourceHandleProtocol 

98 

99 from .transfers import RepoImportBackend 

100 

101log = getLogger(__name__) 

102 

103 

104class ButlerValidationError(ValidationError): 

105 """There is a problem with the Butler configuration.""" 

106 

107 pass 

108 

109 

110class Butler(LimitedButler): 

111 """Main entry point for the data access system. 

112 

113 Parameters 

114 ---------- 

115 config : `ButlerConfig`, `Config` or `str`, optional. 

116 Configuration. Anything acceptable to the 

117 `ButlerConfig` constructor. If a directory path 

118 is given the configuration will be read from a ``butler.yaml`` file in 

119 that location. If `None` is given default values will be used. 

120 butler : `Butler`, optional. 

121 If provided, construct a new Butler that uses the same registry and 

122 datastore as the given one, but with the given collection and run. 

123 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

124 arguments. 

125 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

126 An expression specifying the collections to be searched (in order) when 

127 reading datasets. 

128 This may be a `str` collection name or an iterable thereof. 

129 See :ref:`daf_butler_collection_expressions` for more information. 

130 These collections are not registered automatically and must be 

131 manually registered before they are used by any method, but they may be 

132 manually registered after the `Butler` is initialized. 

133 run : `str`, optional 

134 Name of the `~CollectionType.RUN` collection new datasets should be 

135 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

136 ``collections`` will be set to ``[run]``. If not `None`, this 

137 collection will automatically be registered. If this is not set (and 

138 ``writeable`` is not set either), a read-only butler will be created. 

139 searchPaths : `list` of `str`, optional 

140 Directory paths to search when calculating the full Butler 

141 configuration. Not used if the supplied config is already a 

142 `ButlerConfig`. 

143 writeable : `bool`, optional 

144 Explicitly sets whether the butler supports write operations. If not 

145 provided, a read-write butler is created if any of ``run``, ``tags``, 

146 or ``chains`` is non-empty. 

147 inferDefaults : `bool`, optional 

148 If `True` (default) infer default data ID values from the values 

149 present in the datasets in ``collections``: if all collections have the 

150 same value (or no value) for a governor dimension, that value will be 

151 the default for that dimension. Nonexistent collections are ignored. 

152 If a default value is provided explicitly for a governor dimension via 

153 ``**kwargs``, no default will be inferred for that dimension. 

154 without_datastore : `bool`, optional 

155 If `True` do not attach a datastore to this butler. Any attempts 

156 to use a datastore will fail. 

157 **kwargs : `str` 

158 Default data ID key-value pairs. These may only identify "governor" 

159 dimensions like ``instrument`` and ``skymap``. 

160 

161 Examples 

162 -------- 

163 While there are many ways to control exactly how a `Butler` interacts with 

164 the collections in its `Registry`, the most common cases are still simple. 

165 

166 For a read-only `Butler` that searches one collection, do:: 

167 

168 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

169 

170 For a read-write `Butler` that writes to and reads from a 

171 `~CollectionType.RUN` collection:: 

172 

173 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

174 

175 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

176 because we want to write to one `~CollectionType.RUN` collection but read 

177 from several others (as well):: 

178 

179 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

180 collections=["u/alice/DM-50000/a", 

181 "u/bob/DM-49998", 

182 "HSC/defaults"]) 

183 

184 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

185 Datasets will be read first from that run (since it appears first in the 

186 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

187 

188 Finally, one can always create a `Butler` with no collections:: 

189 

190 butler = Butler("/path/to/repo", writeable=True) 

191 

192 This can be extremely useful when you just want to use ``butler.registry``, 

193 e.g. for inserting dimension data or managing collections, or when the 

194 collections you want to use with the butler are not consistent. 

195 Passing ``writeable`` explicitly here is only necessary if you want to be 

196 able to make changes to the repo - usually the value for ``writeable`` can 

197 be guessed from the collection arguments provided, but it defaults to 

198 `False` when there are not collection arguments. 

199 """ 

200 

201 def __init__( 

202 self, 

203 config: Config | ResourcePathExpression | None = None, 

204 *, 

205 butler: Butler | None = None, 

206 collections: Any = None, 

207 run: str | None = None, 

208 searchPaths: Sequence[ResourcePathExpression] | None = None, 

209 writeable: bool | None = None, 

210 inferDefaults: bool = True, 

211 without_datastore: bool = False, 

212 **kwargs: str, 

213 ): 

214 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

215 # Load registry, datastore, etc. from config or existing butler. 

216 if butler is not None: 

217 if config is not None or searchPaths is not None or writeable is not None: 

218 raise TypeError( 

219 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

220 ) 

221 self._registry = butler._registry.copy(defaults) 

222 self._datastore = butler._datastore 

223 self.storageClasses = butler.storageClasses 

224 self._config: ButlerConfig = butler._config 

225 else: 

226 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) 

227 try: 

228 butlerRoot = self._config.get("root", self._config.configDir) 

229 if writeable is None: 

230 writeable = run is not None 

231 self._registry = _RegistryFactory(self._config).from_config( 

232 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

233 ) 

234 if without_datastore: 

235 self._datastore = NullDatastore(None, None) 

236 else: 

237 self._datastore = Datastore.fromConfig( 

238 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

239 ) 

240 self.storageClasses = StorageClassFactory() 

241 self.storageClasses.addFromConfig(self._config) 

242 except Exception: 

243 # Failures here usually mean that configuration is incomplete, 

244 # just issue an error message which includes config file URI. 

245 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

246 raise 

247 

248 # For execution butler the datastore needs a special 

249 # dependency-inversion trick. This is not used by regular butler, 

250 # but we do not have a way to distinguish regular butler from execution 

251 # butler. 

252 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

253 

254 if "run" in self._config or "collection" in self._config: 

255 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

256 

257 self._registry_shim = RegistryShim(self) 

258 

259 GENERATION: ClassVar[int] = 3 

260 """This is a Generation 3 Butler. 

261 

262 This attribute may be removed in the future, once the Generation 2 Butler 

263 interface has been fully retired; it should only be used in transitional 

264 code. 

265 """ 

266 

267 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

268 """Return DatasetType defined in registry given dataset type name.""" 

269 try: 

270 return self._registry.getDatasetType(name) 

271 except MissingDatasetTypeError: 

272 return None 

273 

274 @classmethod 

275 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

276 """Look up the label in a butler repository index. 

277 

278 Parameters 

279 ---------- 

280 label : `str` 

281 Label of the Butler repository to look up. 

282 return_label : `bool`, optional 

283 If ``label`` cannot be found in the repository index (either 

284 because index is not defined or ``label`` is not in the index) and 

285 ``return_label`` is `True` then return ``ResourcePath(label)``. 

286 If ``return_label`` is `False` (default) then an exception will be 

287 raised instead. 

288 

289 Returns 

290 ------- 

291 uri : `lsst.resources.ResourcePath` 

292 URI to the Butler repository associated with the given label or 

293 default value if it is provided. 

294 

295 Raises 

296 ------ 

297 KeyError 

298 Raised if the label is not found in the index, or if an index 

299 is not defined, and ``return_label`` is `False`. 

300 

301 Notes 

302 ----- 

303 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

304 information is discovered. 

305 """ 

306 return ButlerRepoIndex.get_repo_uri(label, return_label) 

307 

308 @classmethod 

309 def get_known_repos(cls) -> set[str]: 

310 """Retrieve the list of known repository labels. 

311 

312 Returns 

313 ------- 

314 repos : `set` of `str` 

315 All the known labels. Can be empty if no index can be found. 

316 

317 Notes 

318 ----- 

319 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

320 information is discovered. 

321 """ 

322 return ButlerRepoIndex.get_known_repos() 

323 

324 @staticmethod 

325 def makeRepo( 

326 root: ResourcePathExpression, 

327 config: Config | str | None = None, 

328 dimensionConfig: Config | str | None = None, 

329 standalone: bool = False, 

330 searchPaths: list[str] | None = None, 

331 forceConfigRoot: bool = True, 

332 outfile: ResourcePathExpression | None = None, 

333 overwrite: bool = False, 

334 ) -> Config: 

335 """Create an empty data repository by adding a butler.yaml config 

336 to a repository root directory. 

337 

338 Parameters 

339 ---------- 

340 root : `lsst.resources.ResourcePathExpression` 

341 Path or URI to the root location of the new repository. Will be 

342 created if it does not exist. 

343 config : `Config` or `str`, optional 

344 Configuration to write to the repository, after setting any 

345 root-dependent Registry or Datastore config options. Can not 

346 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

347 configuration will be used. Root-dependent config options 

348 specified in this config are overwritten if ``forceConfigRoot`` 

349 is `True`. 

350 dimensionConfig : `Config` or `str`, optional 

351 Configuration for dimensions, will be used to initialize registry 

352 database. 

353 standalone : `bool` 

354 If True, write all expanded defaults, not just customized or 

355 repository-specific settings. 

356 This (mostly) decouples the repository from the default 

357 configuration, insulating it from changes to the defaults (which 

358 may be good or bad, depending on the nature of the changes). 

359 Future *additions* to the defaults will still be picked up when 

360 initializing `Butlers` to repos created with ``standalone=True``. 

361 searchPaths : `list` of `str`, optional 

362 Directory paths to search when calculating the full butler 

363 configuration. 

364 forceConfigRoot : `bool`, optional 

365 If `False`, any values present in the supplied ``config`` that 

366 would normally be reset are not overridden and will appear 

367 directly in the output config. This allows non-standard overrides 

368 of the root directory for a datastore or registry to be given. 

369 If this parameter is `True` the values for ``root`` will be 

370 forced into the resulting config if appropriate. 

371 outfile : `lss.resources.ResourcePathExpression`, optional 

372 If not-`None`, the output configuration will be written to this 

373 location rather than into the repository itself. Can be a URI 

374 string. Can refer to a directory that will be used to write 

375 ``butler.yaml``. 

376 overwrite : `bool`, optional 

377 Create a new configuration file even if one already exists 

378 in the specified output location. Default is to raise 

379 an exception. 

380 

381 Returns 

382 ------- 

383 config : `Config` 

384 The updated `Config` instance written to the repo. 

385 

386 Raises 

387 ------ 

388 ValueError 

389 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

390 regular Config (as these subclasses would make it impossible to 

391 support ``standalone=False``). 

392 FileExistsError 

393 Raised if the output config file already exists. 

394 os.error 

395 Raised if the directory does not exist, exists but is not a 

396 directory, or cannot be created. 

397 

398 Notes 

399 ----- 

400 Note that when ``standalone=False`` (the default), the configuration 

401 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

402 construct the repository should also be used to construct any Butlers 

403 to avoid configuration inconsistencies. 

404 """ 

405 if isinstance(config, ButlerConfig | ConfigSubset): 

406 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

407 

408 # Ensure that the root of the repository exists or can be made 

409 root_uri = ResourcePath(root, forceDirectory=True) 

410 root_uri.mkdir() 

411 

412 config = Config(config) 

413 

414 # If we are creating a new repo from scratch with relative roots, 

415 # do not propagate an explicit root from the config file 

416 if "root" in config: 

417 del config["root"] 

418 

419 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

420 imported_class = doImportType(full["datastore", "cls"]) 

421 if not issubclass(imported_class, Datastore): 

422 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

423 datastoreClass: type[Datastore] = imported_class 

424 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

425 

426 # if key exists in given config, parse it, otherwise parse the defaults 

427 # in the expanded config 

428 if config.get(("registry", "db")): 

429 registryConfig = RegistryConfig(config) 

430 else: 

431 registryConfig = RegistryConfig(full) 

432 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

433 if defaultDatabaseUri is not None: 

434 Config.updateParameters( 

435 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

436 ) 

437 else: 

438 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

439 

440 if standalone: 

441 config.merge(full) 

442 else: 

443 # Always expand the registry.managers section into the per-repo 

444 # config, because after the database schema is created, it's not 

445 # allowed to change anymore. Note that in the standalone=True 

446 # branch, _everything_ in the config is expanded, so there's no 

447 # need to special case this. 

448 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

449 configURI: ResourcePathExpression 

450 if outfile is not None: 

451 # When writing to a separate location we must include 

452 # the root of the butler repo in the config else it won't know 

453 # where to look. 

454 config["root"] = root_uri.geturl() 

455 configURI = outfile 

456 else: 

457 configURI = root_uri 

458 # Strip obscore configuration, if it is present, before writing config 

459 # to a file, obscore config will be stored in registry. 

460 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

461 config_to_write = config.copy() 

462 del config_to_write[obscore_config_key] 

463 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

464 # configFile attribute is updated, need to copy it to original. 

465 config.configFile = config_to_write.configFile 

466 else: 

467 config.dumpToUri(configURI, overwrite=overwrite) 

468 

469 # Create Registry and populate tables 

470 registryConfig = RegistryConfig(config.get("registry")) 

471 dimensionConfig = DimensionConfig(dimensionConfig) 

472 _RegistryFactory(registryConfig).create_from_config( 

473 dimensionConfig=dimensionConfig, butlerRoot=root_uri 

474 ) 

475 

476 log.verbose("Wrote new Butler configuration file to %s", configURI) 

477 

478 return config 

479 

480 @classmethod 

481 def _unpickle( 

482 cls, 

483 config: ButlerConfig, 

484 collections: tuple[str, ...] | None, 

485 run: str | None, 

486 defaultDataId: dict[str, str], 

487 writeable: bool, 

488 ) -> Butler: 

489 """Callable used to unpickle a Butler. 

490 

491 We prefer not to use ``Butler.__init__`` directly so we can force some 

492 of its many arguments to be keyword-only (note that ``__reduce__`` 

493 can only invoke callables with positional arguments). 

494 

495 Parameters 

496 ---------- 

497 config : `ButlerConfig` 

498 Butler configuration, already coerced into a true `ButlerConfig` 

499 instance (and hence after any search paths for overrides have been 

500 utilized). 

501 collections : `tuple` [ `str` ] 

502 Names of the default collections to read from. 

503 run : `str`, optional 

504 Name of the default `~CollectionType.RUN` collection to write to. 

505 defaultDataId : `dict` [ `str`, `str` ] 

506 Default data ID values. 

507 writeable : `bool` 

508 Whether the Butler should support write operations. 

509 

510 Returns 

511 ------- 

512 butler : `Butler` 

513 A new `Butler` instance. 

514 """ 

515 # MyPy doesn't recognize that the kwargs below are totally valid; it 

516 # seems to think '**defaultDataId* is a _positional_ argument! 

517 return cls( 

518 config=config, 

519 collections=collections, 

520 run=run, 

521 writeable=writeable, 

522 **defaultDataId, # type: ignore 

523 ) 

524 

525 def __reduce__(self) -> tuple: 

526 """Support pickling.""" 

527 return ( 

528 Butler._unpickle, 

529 ( 

530 self._config, 

531 self.collections, 

532 self.run, 

533 self._registry.defaults.dataId.byName(), 

534 self._registry.isWriteable(), 

535 ), 

536 ) 

537 

538 def __str__(self) -> str: 

539 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

540 self.collections, self.run, self._datastore, self._registry 

541 ) 

542 

543 def isWriteable(self) -> bool: 

544 """Return `True` if this `Butler` supports write operations.""" 

545 return self._registry.isWriteable() 

546 

547 @contextlib.contextmanager 

548 def transaction(self) -> Iterator[None]: 

549 """Context manager supporting `Butler` transactions. 

550 

551 Transactions can be nested. 

552 """ 

553 with self._registry.transaction(), self._datastore.transaction(): 

554 yield 

555 

556 def _standardizeArgs( 

557 self, 

558 datasetRefOrType: DatasetRef | DatasetType | str, 

559 dataId: DataId | None = None, 

560 for_put: bool = True, 

561 **kwargs: Any, 

562 ) -> tuple[DatasetType, DataId | None]: 

563 """Standardize the arguments passed to several Butler APIs. 

564 

565 Parameters 

566 ---------- 

567 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

568 When `DatasetRef` the `dataId` should be `None`. 

569 Otherwise the `DatasetType` or name thereof. 

570 dataId : `dict` or `DataCoordinate` 

571 A `dict` of `Dimension` link name, value pairs that label the 

572 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

573 should be provided as the second argument. 

574 for_put : `bool`, optional 

575 If `True` this call is invoked as part of a `Butler.put()`. 

576 Otherwise it is assumed to be part of a `Butler.get()`. This 

577 parameter is only relevant if there is dataset type 

578 inconsistency. 

579 **kwargs 

580 Additional keyword arguments used to augment or construct a 

581 `DataCoordinate`. See `DataCoordinate.standardize` 

582 parameters. 

583 

584 Returns 

585 ------- 

586 datasetType : `DatasetType` 

587 A `DatasetType` instance extracted from ``datasetRefOrType``. 

588 dataId : `dict` or `DataId`, optional 

589 Argument that can be used (along with ``kwargs``) to construct a 

590 `DataId`. 

591 

592 Notes 

593 ----- 

594 Butler APIs that conceptually need a DatasetRef also allow passing a 

595 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

596 keyword arguments that can be used to construct one) separately. This 

597 method accepts those arguments and always returns a true `DatasetType` 

598 and a `DataId` or `dict`. 

599 

600 Standardization of `dict` vs `DataId` is best handled by passing the 

601 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

602 generally similarly flexible. 

603 """ 

604 externalDatasetType: DatasetType | None = None 

605 internalDatasetType: DatasetType | None = None 

606 if isinstance(datasetRefOrType, DatasetRef): 

607 if dataId is not None or kwargs: 

608 raise ValueError("DatasetRef given, cannot use dataId as well") 

609 externalDatasetType = datasetRefOrType.datasetType 

610 dataId = datasetRefOrType.dataId 

611 else: 

612 # Don't check whether DataId is provided, because Registry APIs 

613 # can usually construct a better error message when it wasn't. 

614 if isinstance(datasetRefOrType, DatasetType): 

615 externalDatasetType = datasetRefOrType 

616 else: 

617 internalDatasetType = self._registry.getDatasetType(datasetRefOrType) 

618 

619 # Check that they are self-consistent 

620 if externalDatasetType is not None: 

621 internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) 

622 if externalDatasetType != internalDatasetType: 

623 # We can allow differences if they are compatible, depending 

624 # on whether this is a get or a put. A get requires that 

625 # the python type associated with the datastore can be 

626 # converted to the user type. A put requires that the user 

627 # supplied python type can be converted to the internal 

628 # type expected by registry. 

629 relevantDatasetType = internalDatasetType 

630 if for_put: 

631 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

632 else: 

633 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

634 relevantDatasetType = externalDatasetType 

635 if not is_compatible: 

636 raise ValueError( 

637 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

638 f"registry definition ({internalDatasetType})" 

639 ) 

640 # Override the internal definition. 

641 internalDatasetType = relevantDatasetType 

642 

643 assert internalDatasetType is not None 

644 return internalDatasetType, dataId 

645 

646 def _rewrite_data_id( 

647 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

648 ) -> tuple[DataId | None, dict[str, Any]]: 

649 """Rewrite a data ID taking into account dimension records. 

650 

651 Take a Data ID and keyword args and rewrite it if necessary to 

652 allow the user to specify dimension records rather than dimension 

653 primary values. 

654 

655 This allows a user to include a dataId dict with keys of 

656 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

657 the integer exposure ID. It also allows a string to be given 

658 for a dimension value rather than the integer ID if that is more 

659 convenient. For example, rather than having to specifying the 

660 detector with ``detector.full_name``, a string given for ``detector`` 

661 will be interpreted as the full name and converted to the integer 

662 value. 

663 

664 Keyword arguments can also use strings for dimensions like detector 

665 and exposure but python does not allow them to include ``.`` and 

666 so the ``exposure.day_obs`` syntax can not be used in a keyword 

667 argument. 

668 

669 Parameters 

670 ---------- 

671 dataId : `dict` or `DataCoordinate` 

672 A `dict` of `Dimension` link name, value pairs that will label the 

673 `DatasetRef` within a Collection. 

674 datasetType : `DatasetType` 

675 The dataset type associated with this dataId. Required to 

676 determine the relevant dimensions. 

677 **kwargs 

678 Additional keyword arguments used to augment or construct a 

679 `DataId`. See `DataId` parameters. 

680 

681 Returns 

682 ------- 

683 dataId : `dict` or `DataCoordinate` 

684 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

685 no keyword arguments, the original dataId will be returned 

686 unchanged. 

687 **kwargs : `dict` 

688 Any unused keyword arguments (would normally be empty dict). 

689 """ 

690 # Do nothing if we have a standalone DataCoordinate. 

691 if isinstance(dataId, DataCoordinate) and not kwargs: 

692 return dataId, kwargs 

693 

694 # Process dimension records that are using record information 

695 # rather than ids 

696 newDataId: dict[str, DataIdValue] = {} 

697 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

698 

699 # if all the dataId comes from keyword parameters we do not need 

700 # to do anything here because they can't be of the form 

701 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

702 if dataId: 

703 for k, v in dataId.items(): 

704 # If we have a Dimension we do not need to do anything 

705 # because it cannot be a compound key. 

706 if isinstance(k, str) and "." in k: 

707 # Someone is using a more human-readable dataId 

708 dimensionName, record = k.split(".", 1) 

709 byRecord[dimensionName][record] = v 

710 elif isinstance(k, Dimension): 

711 newDataId[k.name] = v 

712 else: 

713 newDataId[k] = v 

714 

715 # Go through the updated dataId and check the type in case someone is 

716 # using an alternate key. We have already filtered out the compound 

717 # keys dimensions.record format. 

718 not_dimensions = {} 

719 

720 # Will need to look in the dataId and the keyword arguments 

721 # and will remove them if they need to be fixed or are unrecognized. 

722 for dataIdDict in (newDataId, kwargs): 

723 # Use a list so we can adjust the dict safely in the loop 

724 for dimensionName in list(dataIdDict): 

725 value = dataIdDict[dimensionName] 

726 try: 

727 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

728 except KeyError: 

729 # This is not a real dimension 

730 not_dimensions[dimensionName] = value 

731 del dataIdDict[dimensionName] 

732 continue 

733 

734 # Convert an integral type to an explicit int to simplify 

735 # comparisons here 

736 if isinstance(value, numbers.Integral): 

737 value = int(value) 

738 

739 if not isinstance(value, dimension.primaryKey.getPythonType()): 

740 for alternate in dimension.alternateKeys: 

741 if isinstance(value, alternate.getPythonType()): 

742 byRecord[dimensionName][alternate.name] = value 

743 del dataIdDict[dimensionName] 

744 log.debug( 

745 "Converting dimension %s to %s.%s=%s", 

746 dimensionName, 

747 dimensionName, 

748 alternate.name, 

749 value, 

750 ) 

751 break 

752 else: 

753 log.warning( 

754 "Type mismatch found for value '%r' provided for dimension %s. " 

755 "Could not find matching alternative (primary key has type %s) " 

756 "so attempting to use as-is.", 

757 value, 

758 dimensionName, 

759 dimension.primaryKey.getPythonType(), 

760 ) 

761 

762 # By this point kwargs and newDataId should only include valid 

763 # dimensions. Merge kwargs in to the new dataId and log if there 

764 # are dimensions in both (rather than calling update). 

765 for k, v in kwargs.items(): 

766 if k in newDataId and newDataId[k] != v: 

767 log.debug( 

768 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

769 ) 

770 newDataId[k] = v 

771 # No need to retain any values in kwargs now. 

772 kwargs = {} 

773 

774 # If we have some unrecognized dimensions we have to try to connect 

775 # them to records in other dimensions. This is made more complicated 

776 # by some dimensions having records with clashing names. A mitigation 

777 # is that we can tell by this point which dimensions are missing 

778 # for the DatasetType but this does not work for calibrations 

779 # where additional dimensions can be used to constrain the temporal 

780 # axis. 

781 if not_dimensions: 

782 # Search for all dimensions even if we have been given a value 

783 # explicitly. In some cases records are given as well as the 

784 # actually dimension and this should not be an error if they 

785 # match. 

786 mandatoryDimensions = datasetType.dimensions.names # - provided 

787 

788 candidateDimensions: set[str] = set() 

789 candidateDimensions.update(mandatoryDimensions) 

790 

791 # For calibrations we may well be needing temporal dimensions 

792 # so rather than always including all dimensions in the scan 

793 # restrict things a little. It is still possible for there 

794 # to be confusion over day_obs in visit vs exposure for example. 

795 # If we are not searching calibration collections things may 

796 # fail but they are going to fail anyway because of the 

797 # ambiguousness of the dataId... 

798 if datasetType.isCalibration(): 

799 for dim in self.dimensions.getStaticDimensions(): 

800 if dim.temporal: 

801 candidateDimensions.add(str(dim)) 

802 

803 # Look up table for the first association with a dimension 

804 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

805 

806 # Keep track of whether an item is associated with multiple 

807 # dimensions. 

808 counter: Counter[str] = Counter() 

809 assigned: dict[str, set[str]] = defaultdict(set) 

810 

811 # Go through the missing dimensions and associate the 

812 # given names with records within those dimensions 

813 matched_dims = set() 

814 for dimensionName in candidateDimensions: 

815 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

816 fields = dimension.metadata.names | dimension.uniqueKeys.names 

817 for field in not_dimensions: 

818 if field in fields: 

819 guessedAssociation[dimensionName][field] = not_dimensions[field] 

820 counter[dimensionName] += 1 

821 assigned[field].add(dimensionName) 

822 matched_dims.add(field) 

823 

824 # Calculate the fields that matched nothing. 

825 never_found = set(not_dimensions) - matched_dims 

826 

827 if never_found: 

828 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

829 

830 # There is a chance we have allocated a single dataId item 

831 # to multiple dimensions. Need to decide which should be retained. 

832 # For now assume that the most popular alternative wins. 

833 # This means that day_obs with seq_num will result in 

834 # exposure.day_obs and not visit.day_obs 

835 # Also prefer an explicitly missing dimension over an inferred 

836 # temporal dimension. 

837 for fieldName, assignedDimensions in assigned.items(): 

838 if len(assignedDimensions) > 1: 

839 # Pick the most popular (preferring mandatory dimensions) 

840 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

841 if requiredButMissing: 

842 candidateDimensions = requiredButMissing 

843 else: 

844 candidateDimensions = assignedDimensions 

845 

846 # If this is a choice between visit and exposure and 

847 # neither was a required part of the dataset type, 

848 # (hence in this branch) always prefer exposure over 

849 # visit since exposures are always defined and visits 

850 # are defined from exposures. 

851 if candidateDimensions == {"exposure", "visit"}: 

852 candidateDimensions = {"exposure"} 

853 

854 # Select the relevant items and get a new restricted 

855 # counter. 

856 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

857 duplicatesCounter: Counter[str] = Counter() 

858 duplicatesCounter.update(theseCounts) 

859 

860 # Choose the most common. If they are equally common 

861 # we will pick the one that was found first. 

862 # Returns a list of tuples 

863 selected = duplicatesCounter.most_common(1)[0][0] 

864 

865 log.debug( 

866 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

867 " Removed ambiguity by choosing dimension %s.", 

868 fieldName, 

869 ", ".join(assignedDimensions), 

870 selected, 

871 ) 

872 

873 for candidateDimension in assignedDimensions: 

874 if candidateDimension != selected: 

875 del guessedAssociation[candidateDimension][fieldName] 

876 

877 # Update the record look up dict with the new associations 

878 for dimensionName, values in guessedAssociation.items(): 

879 if values: # A dict might now be empty 

880 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

881 byRecord[dimensionName].update(values) 

882 

883 if byRecord: 

884 # Some record specifiers were found so we need to convert 

885 # them to the Id form 

886 for dimensionName, values in byRecord.items(): 

887 if dimensionName in newDataId: 

888 log.debug( 

889 "DataId specified explicit %s dimension value of %s in addition to" 

890 " general record specifiers for it of %s. Ignoring record information.", 

891 dimensionName, 

892 newDataId[dimensionName], 

893 str(values), 

894 ) 

895 # Get the actual record and compare with these values. 

896 try: 

897 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

898 except DataIdError: 

899 raise ValueError( 

900 f"Could not find dimension '{dimensionName}'" 

901 f" with dataId {newDataId} as part of comparing with" 

902 f" record values {byRecord[dimensionName]}" 

903 ) from None 

904 if len(recs) == 1: 

905 errmsg: list[str] = [] 

906 for k, v in values.items(): 

907 if (recval := getattr(recs[0], k)) != v: 

908 errmsg.append(f"{k}({recval} != {v})") 

909 if errmsg: 

910 raise ValueError( 

911 f"Dimension {dimensionName} in dataId has explicit value" 

912 " inconsistent with records: " + ", ".join(errmsg) 

913 ) 

914 else: 

915 # Multiple matches for an explicit dimension 

916 # should never happen but let downstream complain. 

917 pass 

918 continue 

919 

920 # Build up a WHERE expression 

921 bind = dict(values.items()) 

922 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

923 

924 # Hopefully we get a single record that matches 

925 records = set( 

926 self._registry.queryDimensionRecords( 

927 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

928 ) 

929 ) 

930 

931 if len(records) != 1: 

932 if len(records) > 1: 

933 # visit can have an ambiguous answer without involving 

934 # visit_system. The default visit_system is defined 

935 # by the instrument. 

936 if ( 

937 dimensionName == "visit" 

938 and "visit_system_membership" in self.dimensions 

939 and "visit_system" in self.dimensions["instrument"].metadata 

940 ): 

941 instrument_records = list( 

942 self._registry.queryDimensionRecords( 

943 "instrument", 

944 dataId=newDataId, 

945 **kwargs, 

946 ) 

947 ) 

948 if len(instrument_records) == 1: 

949 visit_system = instrument_records[0].visit_system 

950 if visit_system is None: 

951 # Set to a value that will never match. 

952 visit_system = -1 

953 

954 # Look up each visit in the 

955 # visit_system_membership records. 

956 for rec in records: 

957 membership = list( 

958 self._registry.queryDimensionRecords( 

959 # Use bind to allow zero results. 

960 # This is a fully-specified query. 

961 "visit_system_membership", 

962 where="instrument = inst AND visit_system = system AND visit = v", 

963 bind=dict( 

964 inst=instrument_records[0].name, system=visit_system, v=rec.id 

965 ), 

966 ) 

967 ) 

968 if membership: 

969 # This record is the right answer. 

970 records = {rec} 

971 break 

972 

973 # The ambiguity may have been resolved so check again. 

974 if len(records) > 1: 

975 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

976 for r in records: 

977 log.debug("- %s", str(r)) 

978 raise ValueError( 

979 f"DataId specification for dimension {dimensionName} is not" 

980 f" uniquely constrained to a single dataset by {values}." 

981 f" Got {len(records)} results." 

982 ) 

983 else: 

984 raise ValueError( 

985 f"DataId specification for dimension {dimensionName} matched no" 

986 f" records when constrained by {values}" 

987 ) 

988 

989 # Get the primary key from the real dimension object 

990 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

991 if not isinstance(dimension, Dimension): 

992 raise RuntimeError( 

993 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

994 ) 

995 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

996 

997 return newDataId, kwargs 

998 

999 def _findDatasetRef( 

1000 self, 

1001 datasetRefOrType: DatasetRef | DatasetType | str, 

1002 dataId: DataId | None = None, 

1003 *, 

1004 collections: Any = None, 

1005 predict: bool = False, 

1006 run: str | None = None, 

1007 **kwargs: Any, 

1008 ) -> DatasetRef: 

1009 """Shared logic for methods that start with a search for a dataset in 

1010 the registry. 

1011 

1012 Parameters 

1013 ---------- 

1014 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1015 When `DatasetRef` the `dataId` should be `None`. 

1016 Otherwise the `DatasetType` or name thereof. 

1017 dataId : `dict` or `DataCoordinate`, optional 

1018 A `dict` of `Dimension` link name, value pairs that label the 

1019 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1020 should be provided as the first argument. 

1021 collections : Any, optional 

1022 Collections to be searched, overriding ``self.collections``. 

1023 Can be any of the types supported by the ``collections`` argument 

1024 to butler construction. 

1025 predict : `bool`, optional 

1026 If `True`, return a newly created `DatasetRef` with a unique 

1027 dataset ID if finding a reference in the `Registry` fails. 

1028 Defaults to `False`. 

1029 run : `str`, optional 

1030 Run collection name to use for creating `DatasetRef` for predicted 

1031 datasets. Only used if ``predict`` is `True`. 

1032 **kwargs 

1033 Additional keyword arguments used to augment or construct a 

1034 `DataId`. See `DataId` parameters. 

1035 

1036 Returns 

1037 ------- 

1038 ref : `DatasetRef` 

1039 A reference to the dataset identified by the given arguments. 

1040 This can be the same dataset reference as given if it was 

1041 resolved. 

1042 

1043 Raises 

1044 ------ 

1045 LookupError 

1046 Raised if no matching dataset exists in the `Registry` (and 

1047 ``predict`` is `False`). 

1048 ValueError 

1049 Raised if a resolved `DatasetRef` was passed as an input, but it 

1050 differs from the one found in the registry. 

1051 TypeError 

1052 Raised if no collections were provided. 

1053 """ 

1054 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1055 if isinstance(datasetRefOrType, DatasetRef): 

1056 if collections is not None: 

1057 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

1058 return datasetRefOrType 

1059 timespan: Timespan | None = None 

1060 

1061 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1062 

1063 if datasetType.isCalibration(): 

1064 # Because this is a calibration dataset, first try to make a 

1065 # standardize the data ID without restricting the dimensions to 

1066 # those of the dataset type requested, because there may be extra 

1067 # dimensions that provide temporal information for a validity-range 

1068 # lookup. 

1069 dataId = DataCoordinate.standardize( 

1070 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

1071 ) 

1072 if dataId.graph.temporal: 

1073 dataId = self._registry.expandDataId(dataId) 

1074 timespan = dataId.timespan 

1075 else: 

1076 # Standardize the data ID to just the dimensions of the dataset 

1077 # type instead of letting registry.findDataset do it, so we get the 

1078 # result even if no dataset is found. 

1079 dataId = DataCoordinate.standardize( 

1080 dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

1081 ) 

1082 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1083 # present in the current collection. 

1084 ref = self._registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1085 if ref is None: 

1086 if predict: 

1087 if run is None: 

1088 run = self.run 

1089 if run is None: 

1090 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1091 return DatasetRef(datasetType, dataId, run=run) 

1092 else: 

1093 if collections is None: 

1094 collections = self._registry.defaults.collections 

1095 raise LookupError( 

1096 f"Dataset {datasetType.name} with data ID {dataId} " 

1097 f"could not be found in collections {collections}." 

1098 ) 

1099 if datasetType != ref.datasetType: 

1100 # If they differ it is because the user explicitly specified 

1101 # a compatible dataset type to this call rather than using the 

1102 # registry definition. The DatasetRef must therefore be recreated 

1103 # using the user definition such that the expected type is 

1104 # returned. 

1105 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1106 

1107 return ref 

1108 

1109 # TODO: remove on DM-40067. 

1110 @transactional 

1111 @deprecated( 

1112 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1113 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1114 " were relying on the run parameter to determine the run." 

1115 " Will be removed after v26.0.", 

1116 version="v26.0", 

1117 category=FutureWarning, 

1118 ) 

1119 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1120 # Docstring inherited. 

1121 return self.put(obj, ref) 

1122 

1123 @transactional 

1124 def put( 

1125 self, 

1126 obj: Any, 

1127 datasetRefOrType: DatasetRef | DatasetType | str, 

1128 /, 

1129 dataId: DataId | None = None, 

1130 *, 

1131 run: str | None = None, 

1132 **kwargs: Any, 

1133 ) -> DatasetRef: 

1134 """Store and register a dataset. 

1135 

1136 Parameters 

1137 ---------- 

1138 obj : `object` 

1139 The dataset. 

1140 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1141 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1142 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1143 `DatasetRef` is given the run and ID are used directly. 

1144 dataId : `dict` or `DataCoordinate` 

1145 A `dict` of `Dimension` link name, value pairs that label the 

1146 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1147 should be provided as the second argument. 

1148 run : `str`, optional 

1149 The name of the run the dataset should be added to, overriding 

1150 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1151 **kwargs 

1152 Additional keyword arguments used to augment or construct a 

1153 `DataCoordinate`. See `DataCoordinate.standardize` 

1154 parameters. Not used if a resolve `DatasetRef` is provided. 

1155 

1156 Returns 

1157 ------- 

1158 ref : `DatasetRef` 

1159 A reference to the stored dataset, updated with the correct id if 

1160 given. 

1161 

1162 Raises 

1163 ------ 

1164 TypeError 

1165 Raised if the butler is read-only or if no run has been provided. 

1166 """ 

1167 if isinstance(datasetRefOrType, DatasetRef): 

1168 # This is a direct put of predefined DatasetRef. 

1169 log.debug("Butler put direct: %s", datasetRefOrType) 

1170 if run is not None: 

1171 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

1172 # If registry already has a dataset with the same dataset ID, 

1173 # dataset type and DataId, then _importDatasets will do nothing and 

1174 # just return an original ref. We have to raise in this case, there 

1175 # is a datastore check below for that. 

1176 self._registry._importDatasets([datasetRefOrType], expand=True) 

1177 # Before trying to write to the datastore check that it does not 

1178 # know this dataset. This is prone to races, of course. 

1179 if self._datastore.knows(datasetRefOrType): 

1180 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1181 # Try to write dataset to the datastore, if it fails due to a race 

1182 # with another write, the content of stored data may be 

1183 # unpredictable. 

1184 try: 

1185 self._datastore.put(obj, datasetRefOrType) 

1186 except IntegrityError as e: 

1187 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

1188 return datasetRefOrType 

1189 

1190 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1191 if not self.isWriteable(): 

1192 raise TypeError("Butler is read-only.") 

1193 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1194 

1195 # Handle dimension records in dataId 

1196 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1197 

1198 # Add Registry Dataset entry. 

1199 dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1200 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1201 self._datastore.put(obj, ref) 

1202 

1203 return ref 

1204 

1205 # TODO: remove on DM-40067. 

1206 @deprecated( 

1207 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1208 " Please use Butler.get(). Will be removed after v26.0.", 

1209 version="v26.0", 

1210 category=FutureWarning, 

1211 ) 

1212 def getDirect( 

1213 self, 

1214 ref: DatasetRef, 

1215 *, 

1216 parameters: dict[str, Any] | None = None, 

1217 storageClass: StorageClass | str | None = None, 

1218 ) -> Any: 

1219 """Retrieve a stored dataset. 

1220 

1221 Parameters 

1222 ---------- 

1223 ref : `DatasetRef` 

1224 Resolved reference to an already stored dataset. 

1225 parameters : `dict` 

1226 Additional StorageClass-defined options to control reading, 

1227 typically used to efficiently read only a subset of the dataset. 

1228 storageClass : `StorageClass` or `str`, optional 

1229 The storage class to be used to override the Python type 

1230 returned by this method. By default the returned type matches 

1231 the dataset type definition for this dataset. Specifying a 

1232 read `StorageClass` can force a different type to be returned. 

1233 This type must be compatible with the original type. 

1234 

1235 Returns 

1236 ------- 

1237 obj : `object` 

1238 The dataset. 

1239 """ 

1240 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1241 

1242 # TODO: remove on DM-40067. 

1243 @deprecated( 

1244 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1245 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

1246 version="v26.0", 

1247 category=FutureWarning, 

1248 ) 

1249 def getDirectDeferred( 

1250 self, 

1251 ref: DatasetRef, 

1252 *, 

1253 parameters: dict | None = None, 

1254 storageClass: str | StorageClass | None = None, 

1255 ) -> DeferredDatasetHandle: 

1256 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1257 from a resolved `DatasetRef`. 

1258 

1259 Parameters 

1260 ---------- 

1261 ref : `DatasetRef` 

1262 Resolved reference to an already stored dataset. 

1263 parameters : `dict` 

1264 Additional StorageClass-defined options to control reading, 

1265 typically used to efficiently read only a subset of the dataset. 

1266 storageClass : `StorageClass` or `str`, optional 

1267 The storage class to be used to override the Python type 

1268 returned by this method. By default the returned type matches 

1269 the dataset type definition for this dataset. Specifying a 

1270 read `StorageClass` can force a different type to be returned. 

1271 This type must be compatible with the original type. 

1272 

1273 Returns 

1274 ------- 

1275 obj : `DeferredDatasetHandle` 

1276 A handle which can be used to retrieve a dataset at a later time. 

1277 

1278 Raises 

1279 ------ 

1280 LookupError 

1281 Raised if no matching dataset exists in the `Registry`. 

1282 """ 

1283 # Check that dataset is known to the datastore. 

1284 if not self._datastore.knows(ref): 

1285 raise LookupError(f"Dataset reference {ref} is not known to datastore.") 

1286 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1287 

1288 def getDeferred( 

1289 self, 

1290 datasetRefOrType: DatasetRef | DatasetType | str, 

1291 /, 

1292 dataId: DataId | None = None, 

1293 *, 

1294 parameters: dict | None = None, 

1295 collections: Any = None, 

1296 storageClass: str | StorageClass | None = None, 

1297 **kwargs: Any, 

1298 ) -> DeferredDatasetHandle: 

1299 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1300 after an immediate registry lookup. 

1301 

1302 Parameters 

1303 ---------- 

1304 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1305 When `DatasetRef` the `dataId` should be `None`. 

1306 Otherwise the `DatasetType` or name thereof. 

1307 dataId : `dict` or `DataCoordinate`, optional 

1308 A `dict` of `Dimension` link name, value pairs that label the 

1309 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1310 should be provided as the first argument. 

1311 parameters : `dict` 

1312 Additional StorageClass-defined options to control reading, 

1313 typically used to efficiently read only a subset of the dataset. 

1314 collections : Any, optional 

1315 Collections to be searched, overriding ``self.collections``. 

1316 Can be any of the types supported by the ``collections`` argument 

1317 to butler construction. 

1318 storageClass : `StorageClass` or `str`, optional 

1319 The storage class to be used to override the Python type 

1320 returned by this method. By default the returned type matches 

1321 the dataset type definition for this dataset. Specifying a 

1322 read `StorageClass` can force a different type to be returned. 

1323 This type must be compatible with the original type. 

1324 **kwargs 

1325 Additional keyword arguments used to augment or construct a 

1326 `DataId`. See `DataId` parameters. 

1327 

1328 Returns 

1329 ------- 

1330 obj : `DeferredDatasetHandle` 

1331 A handle which can be used to retrieve a dataset at a later time. 

1332 

1333 Raises 

1334 ------ 

1335 LookupError 

1336 Raised if no matching dataset exists in the `Registry` or 

1337 datastore. 

1338 ValueError 

1339 Raised if a resolved `DatasetRef` was passed as an input, but it 

1340 differs from the one found in the registry. 

1341 TypeError 

1342 Raised if no collections were provided. 

1343 """ 

1344 if isinstance(datasetRefOrType, DatasetRef): 

1345 # Do the quick check first and if that fails, check for artifact 

1346 # existence. This is necessary for datastores that are configured 

1347 # in trust mode where there won't be a record but there will be 

1348 # a file. 

1349 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1350 ref = datasetRefOrType 

1351 else: 

1352 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1353 else: 

1354 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1355 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1356 

1357 def get( 

1358 self, 

1359 datasetRefOrType: DatasetRef | DatasetType | str, 

1360 /, 

1361 dataId: DataId | None = None, 

1362 *, 

1363 parameters: dict[str, Any] | None = None, 

1364 collections: Any = None, 

1365 storageClass: StorageClass | str | None = None, 

1366 **kwargs: Any, 

1367 ) -> Any: 

1368 """Retrieve a stored dataset. 

1369 

1370 Parameters 

1371 ---------- 

1372 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1373 When `DatasetRef` the `dataId` should be `None`. 

1374 Otherwise the `DatasetType` or name thereof. 

1375 If a resolved `DatasetRef`, the associated dataset 

1376 is returned directly without additional querying. 

1377 dataId : `dict` or `DataCoordinate` 

1378 A `dict` of `Dimension` link name, value pairs that label the 

1379 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1380 should be provided as the first argument. 

1381 parameters : `dict` 

1382 Additional StorageClass-defined options to control reading, 

1383 typically used to efficiently read only a subset of the dataset. 

1384 collections : Any, optional 

1385 Collections to be searched, overriding ``self.collections``. 

1386 Can be any of the types supported by the ``collections`` argument 

1387 to butler construction. 

1388 storageClass : `StorageClass` or `str`, optional 

1389 The storage class to be used to override the Python type 

1390 returned by this method. By default the returned type matches 

1391 the dataset type definition for this dataset. Specifying a 

1392 read `StorageClass` can force a different type to be returned. 

1393 This type must be compatible with the original type. 

1394 **kwargs 

1395 Additional keyword arguments used to augment or construct a 

1396 `DataCoordinate`. See `DataCoordinate.standardize` 

1397 parameters. 

1398 

1399 Returns 

1400 ------- 

1401 obj : `object` 

1402 The dataset. 

1403 

1404 Raises 

1405 ------ 

1406 LookupError 

1407 Raised if no matching dataset exists in the `Registry`. 

1408 TypeError 

1409 Raised if no collections were provided. 

1410 

1411 Notes 

1412 ----- 

1413 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1414 this method requires that the given data ID include temporal dimensions 

1415 beyond the dimensions of the dataset type itself, in order to find the 

1416 dataset with the appropriate validity range. For example, a "bias" 

1417 dataset with native dimensions ``{instrument, detector}`` could be 

1418 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1419 ``exposure`` is a temporal dimension. 

1420 """ 

1421 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1422 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1423 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1424 

1425 def getURIs( 

1426 self, 

1427 datasetRefOrType: DatasetRef | DatasetType | str, 

1428 /, 

1429 dataId: DataId | None = None, 

1430 *, 

1431 predict: bool = False, 

1432 collections: Any = None, 

1433 run: str | None = None, 

1434 **kwargs: Any, 

1435 ) -> DatasetRefURIs: 

1436 """Return the URIs associated with the dataset. 

1437 

1438 Parameters 

1439 ---------- 

1440 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1441 When `DatasetRef` the `dataId` should be `None`. 

1442 Otherwise the `DatasetType` or name thereof. 

1443 dataId : `dict` or `DataCoordinate` 

1444 A `dict` of `Dimension` link name, value pairs that label the 

1445 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1446 should be provided as the first argument. 

1447 predict : `bool` 

1448 If `True`, allow URIs to be returned of datasets that have not 

1449 been written. 

1450 collections : Any, optional 

1451 Collections to be searched, overriding ``self.collections``. 

1452 Can be any of the types supported by the ``collections`` argument 

1453 to butler construction. 

1454 run : `str`, optional 

1455 Run to use for predictions, overriding ``self.run``. 

1456 **kwargs 

1457 Additional keyword arguments used to augment or construct a 

1458 `DataCoordinate`. See `DataCoordinate.standardize` 

1459 parameters. 

1460 

1461 Returns 

1462 ------- 

1463 uris : `DatasetRefURIs` 

1464 The URI to the primary artifact associated with this dataset (if 

1465 the dataset was disassembled within the datastore this may be 

1466 `None`), and the URIs to any components associated with the dataset 

1467 artifact. (can be empty if there are no components). 

1468 """ 

1469 ref = self._findDatasetRef( 

1470 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1471 ) 

1472 return self._datastore.getURIs(ref, predict) 

1473 

1474 def getURI( 

1475 self, 

1476 datasetRefOrType: DatasetRef | DatasetType | str, 

1477 /, 

1478 dataId: DataId | None = None, 

1479 *, 

1480 predict: bool = False, 

1481 collections: Any = None, 

1482 run: str | None = None, 

1483 **kwargs: Any, 

1484 ) -> ResourcePath: 

1485 """Return the URI to the Dataset. 

1486 

1487 Parameters 

1488 ---------- 

1489 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1490 When `DatasetRef` the `dataId` should be `None`. 

1491 Otherwise the `DatasetType` or name thereof. 

1492 dataId : `dict` or `DataCoordinate` 

1493 A `dict` of `Dimension` link name, value pairs that label the 

1494 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1495 should be provided as the first argument. 

1496 predict : `bool` 

1497 If `True`, allow URIs to be returned of datasets that have not 

1498 been written. 

1499 collections : Any, optional 

1500 Collections to be searched, overriding ``self.collections``. 

1501 Can be any of the types supported by the ``collections`` argument 

1502 to butler construction. 

1503 run : `str`, optional 

1504 Run to use for predictions, overriding ``self.run``. 

1505 **kwargs 

1506 Additional keyword arguments used to augment or construct a 

1507 `DataCoordinate`. See `DataCoordinate.standardize` 

1508 parameters. 

1509 

1510 Returns 

1511 ------- 

1512 uri : `lsst.resources.ResourcePath` 

1513 URI pointing to the Dataset within the datastore. If the 

1514 Dataset does not exist in the datastore, and if ``predict`` is 

1515 `True`, the URI will be a prediction and will include a URI 

1516 fragment "#predicted". 

1517 If the datastore does not have entities that relate well 

1518 to the concept of a URI the returned URI string will be 

1519 descriptive. The returned URI is not guaranteed to be obtainable. 

1520 

1521 Raises 

1522 ------ 

1523 LookupError 

1524 A URI has been requested for a dataset that does not exist and 

1525 guessing is not allowed. 

1526 ValueError 

1527 Raised if a resolved `DatasetRef` was passed as an input, but it 

1528 differs from the one found in the registry. 

1529 TypeError 

1530 Raised if no collections were provided. 

1531 RuntimeError 

1532 Raised if a URI is requested for a dataset that consists of 

1533 multiple artifacts. 

1534 """ 

1535 primary, components = self.getURIs( 

1536 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1537 ) 

1538 

1539 if primary is None or components: 

1540 raise RuntimeError( 

1541 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1542 "Use Butler.getURIs() instead." 

1543 ) 

1544 return primary 

1545 

1546 def retrieveArtifacts( 

1547 self, 

1548 refs: Iterable[DatasetRef], 

1549 destination: ResourcePathExpression, 

1550 transfer: str = "auto", 

1551 preserve_path: bool = True, 

1552 overwrite: bool = False, 

1553 ) -> list[ResourcePath]: 

1554 """Retrieve the artifacts associated with the supplied refs. 

1555 

1556 Parameters 

1557 ---------- 

1558 refs : iterable of `DatasetRef` 

1559 The datasets for which artifacts are to be retrieved. 

1560 A single ref can result in multiple artifacts. The refs must 

1561 be resolved. 

1562 destination : `lsst.resources.ResourcePath` or `str` 

1563 Location to write the artifacts. 

1564 transfer : `str`, optional 

1565 Method to use to transfer the artifacts. Must be one of the options 

1566 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1567 "move" is not allowed. 

1568 preserve_path : `bool`, optional 

1569 If `True` the full path of the artifact within the datastore 

1570 is preserved. If `False` the final file component of the path 

1571 is used. 

1572 overwrite : `bool`, optional 

1573 If `True` allow transfers to overwrite existing files at the 

1574 destination. 

1575 

1576 Returns 

1577 ------- 

1578 targets : `list` of `lsst.resources.ResourcePath` 

1579 URIs of file artifacts in destination location. Order is not 

1580 preserved. 

1581 

1582 Notes 

1583 ----- 

1584 For non-file datastores the artifacts written to the destination 

1585 may not match the representation inside the datastore. For example 

1586 a hierarchical data structure in a NoSQL database may well be stored 

1587 as a JSON file. 

1588 """ 

1589 return self._datastore.retrieveArtifacts( 

1590 refs, 

1591 ResourcePath(destination), 

1592 transfer=transfer, 

1593 preserve_path=preserve_path, 

1594 overwrite=overwrite, 

1595 ) 

1596 

1597 def exists( 

1598 self, 

1599 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1600 /, 

1601 data_id: DataId | None = None, 

1602 *, 

1603 full_check: bool = True, 

1604 collections: Any = None, 

1605 **kwargs: Any, 

1606 ) -> DatasetExistence: 

1607 """Indicate whether a dataset is known to Butler registry and 

1608 datastore. 

1609 

1610 Parameters 

1611 ---------- 

1612 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1613 When `DatasetRef` the `dataId` should be `None`. 

1614 Otherwise the `DatasetType` or name thereof. 

1615 data_id : `dict` or `DataCoordinate` 

1616 A `dict` of `Dimension` link name, value pairs that label the 

1617 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1618 should be provided as the first argument. 

1619 full_check : `bool`, optional 

1620 If `True`, an additional check will be made for dataset artifact 

1621 existence. This will involve additional overhead due to the need 

1622 to query an external system. If `False` registry and datastore 

1623 will solely be asked if they know about the dataset but no 

1624 check for the artifact will be performed. 

1625 collections : Any, optional 

1626 Collections to be searched, overriding ``self.collections``. 

1627 Can be any of the types supported by the ``collections`` argument 

1628 to butler construction. 

1629 **kwargs 

1630 Additional keyword arguments used to augment or construct a 

1631 `DataCoordinate`. See `DataCoordinate.standardize` 

1632 parameters. 

1633 

1634 Returns 

1635 ------- 

1636 existence : `DatasetExistence` 

1637 Object indicating whether the dataset is known to registry and 

1638 datastore. Evaluates to `True` if the dataset is present and known 

1639 to both. 

1640 """ 

1641 existence = DatasetExistence.UNRECOGNIZED 

1642 

1643 if isinstance(dataset_ref_or_type, DatasetRef): 

1644 if collections is not None: 

1645 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1646 if data_id is not None: 

1647 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1648 ref = dataset_ref_or_type 

1649 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1650 if registry_ref is not None: 

1651 existence |= DatasetExistence.RECORDED 

1652 

1653 if dataset_ref_or_type != registry_ref: 

1654 # This could mean that storage classes differ, so we should 

1655 # check for that but use the registry ref for the rest of 

1656 # the method. 

1657 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1658 # Use the registry version from now on. 

1659 ref = registry_ref 

1660 else: 

1661 raise ValueError( 

1662 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1663 f"in registry but has different incompatible values ({registry_ref})." 

1664 ) 

1665 else: 

1666 try: 

1667 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1668 except (LookupError, TypeError, NoDefaultCollectionError): 

1669 return existence 

1670 existence |= DatasetExistence.RECORDED 

1671 

1672 if self._datastore.knows(ref): 

1673 existence |= DatasetExistence.DATASTORE 

1674 

1675 if full_check: 

1676 if self._datastore.exists(ref): 

1677 existence |= DatasetExistence._ARTIFACT 

1678 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1679 # Do not add this flag if we have no other idea about a dataset. 

1680 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1681 

1682 return existence 

1683 

1684 def _exists_many( 

1685 self, 

1686 refs: Iterable[DatasetRef], 

1687 /, 

1688 *, 

1689 full_check: bool = True, 

1690 ) -> dict[DatasetRef, DatasetExistence]: 

1691 """Indicate whether multiple datasets are known to Butler registry and 

1692 datastore. 

1693 

1694 This is an experimental API that may change at any moment. 

1695 

1696 Parameters 

1697 ---------- 

1698 refs : iterable of `DatasetRef` 

1699 The datasets to be checked. 

1700 full_check : `bool`, optional 

1701 If `True`, an additional check will be made for dataset artifact 

1702 existence. This will involve additional overhead due to the need 

1703 to query an external system. If `False` registry and datastore 

1704 will solely be asked if they know about the dataset but no 

1705 check for the artifact will be performed. 

1706 

1707 Returns 

1708 ------- 

1709 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1710 Mapping from the given dataset refs to an enum indicating the 

1711 status of the dataset in registry and datastore. 

1712 Each value evaluates to `True` if the dataset is present and known 

1713 to both. 

1714 """ 

1715 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1716 

1717 # Registry does not have a bulk API to check for a ref. 

1718 for ref in refs: 

1719 registry_ref = self._registry.getDataset(ref.id) 

1720 if registry_ref is not None: 

1721 # It is possible, albeit unlikely, that the given ref does 

1722 # not match the one in registry even though the UUID matches. 

1723 # When checking a single ref we raise, but it's impolite to 

1724 # do that when potentially hundreds of refs are being checked. 

1725 # We could change the API to only accept UUIDs and that would 

1726 # remove the ability to even check and remove the worry 

1727 # about differing storage classes. Given the ongoing discussion 

1728 # on refs vs UUIDs and whether to raise or have a new 

1729 # private flag, treat this as a private API for now. 

1730 existence[ref] |= DatasetExistence.RECORDED 

1731 

1732 # Ask datastore if it knows about these refs. 

1733 knows = self._datastore.knows_these(refs) 

1734 for ref, known in knows.items(): 

1735 if known: 

1736 existence[ref] |= DatasetExistence.DATASTORE 

1737 

1738 if full_check: 

1739 mexists = self._datastore.mexists(refs) 

1740 for ref, exists in mexists.items(): 

1741 if exists: 

1742 existence[ref] |= DatasetExistence._ARTIFACT 

1743 else: 

1744 # Do not set this flag if nothing is known about the dataset. 

1745 for ref in existence: 

1746 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1747 existence[ref] |= DatasetExistence._ASSUMED 

1748 

1749 return existence 

1750 

1751 # TODO: remove on DM-40079. 

1752 @deprecated( 

1753 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", 

1754 version="v26.0", 

1755 category=FutureWarning, 

1756 ) 

1757 def datasetExists( 

1758 self, 

1759 datasetRefOrType: DatasetRef | DatasetType | str, 

1760 dataId: DataId | None = None, 

1761 *, 

1762 collections: Any = None, 

1763 **kwargs: Any, 

1764 ) -> bool: 

1765 """Return True if the Dataset is actually present in the Datastore. 

1766 

1767 Parameters 

1768 ---------- 

1769 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1770 When `DatasetRef` the `dataId` should be `None`. 

1771 Otherwise the `DatasetType` or name thereof. 

1772 dataId : `dict` or `DataCoordinate` 

1773 A `dict` of `Dimension` link name, value pairs that label the 

1774 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1775 should be provided as the first argument. 

1776 collections : Any, optional 

1777 Collections to be searched, overriding ``self.collections``. 

1778 Can be any of the types supported by the ``collections`` argument 

1779 to butler construction. 

1780 **kwargs 

1781 Additional keyword arguments used to augment or construct a 

1782 `DataCoordinate`. See `DataCoordinate.standardize` 

1783 parameters. 

1784 

1785 Raises 

1786 ------ 

1787 LookupError 

1788 Raised if the dataset is not even present in the Registry. 

1789 ValueError 

1790 Raised if a resolved `DatasetRef` was passed as an input, but it 

1791 differs from the one found in the registry. 

1792 NoDefaultCollectionError 

1793 Raised if no collections were provided. 

1794 """ 

1795 # A resolved ref may be given that is not known to this butler. 

1796 if isinstance(datasetRefOrType, DatasetRef): 

1797 ref = self._registry.getDataset(datasetRefOrType.id) 

1798 if ref is None: 

1799 raise LookupError( 

1800 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1801 ) 

1802 else: 

1803 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1804 return self._datastore.exists(ref) 

1805 

1806 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1807 """Remove one or more `~CollectionType.RUN` collections and the 

1808 datasets within them. 

1809 

1810 Parameters 

1811 ---------- 

1812 names : `~collections.abc.Iterable` [ `str` ] 

1813 The names of the collections to remove. 

1814 unstore : `bool`, optional 

1815 If `True` (default), delete datasets from all datastores in which 

1816 they are present, and attempt to rollback the registry deletions if 

1817 datastore deletions fail (which may not always be possible). If 

1818 `False`, datastore records for these datasets are still removed, 

1819 but any artifacts (e.g. files) will not be. 

1820 

1821 Raises 

1822 ------ 

1823 TypeError 

1824 Raised if one or more collections are not of type 

1825 `~CollectionType.RUN`. 

1826 """ 

1827 if not self.isWriteable(): 

1828 raise TypeError("Butler is read-only.") 

1829 names = list(names) 

1830 refs: list[DatasetRef] = [] 

1831 for name in names: 

1832 collectionType = self._registry.getCollectionType(name) 

1833 if collectionType is not CollectionType.RUN: 

1834 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1835 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1836 with self._datastore.transaction(), self._registry.transaction(): 

1837 if unstore: 

1838 self._datastore.trash(refs) 

1839 else: 

1840 self._datastore.forget(refs) 

1841 for name in names: 

1842 self._registry.removeCollection(name) 

1843 if unstore: 

1844 # Point of no return for removing artifacts 

1845 self._datastore.emptyTrash() 

1846 

1847 def pruneDatasets( 

1848 self, 

1849 refs: Iterable[DatasetRef], 

1850 *, 

1851 disassociate: bool = True, 

1852 unstore: bool = False, 

1853 tags: Iterable[str] = (), 

1854 purge: bool = False, 

1855 ) -> None: 

1856 # docstring inherited from LimitedButler 

1857 

1858 if not self.isWriteable(): 

1859 raise TypeError("Butler is read-only.") 

1860 if purge: 

1861 if not disassociate: 

1862 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1863 if not unstore: 

1864 raise TypeError("Cannot pass purge=True without unstore=True.") 

1865 elif disassociate: 

1866 tags = tuple(tags) 

1867 if not tags: 

1868 raise TypeError("No tags provided but disassociate=True.") 

1869 for tag in tags: 

1870 collectionType = self._registry.getCollectionType(tag) 

1871 if collectionType is not CollectionType.TAGGED: 

1872 raise TypeError( 

1873 f"Cannot disassociate from collection '{tag}' " 

1874 f"of non-TAGGED type {collectionType.name}." 

1875 ) 

1876 # Transform possibly-single-pass iterable into something we can iterate 

1877 # over multiple times. 

1878 refs = list(refs) 

1879 # Pruning a component of a DatasetRef makes no sense since registry 

1880 # doesn't know about components and datastore might not store 

1881 # components in a separate file 

1882 for ref in refs: 

1883 if ref.datasetType.component(): 

1884 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1885 # We don't need an unreliable Datastore transaction for this, because 

1886 # we've been extra careful to ensure that Datastore.trash only involves 

1887 # mutating the Registry (it can _look_ at Datastore-specific things, 

1888 # but shouldn't change them), and hence all operations here are 

1889 # Registry operations. 

1890 with self._datastore.transaction(), self._registry.transaction(): 

1891 if unstore: 

1892 self._datastore.trash(refs) 

1893 if purge: 

1894 self._registry.removeDatasets(refs) 

1895 elif disassociate: 

1896 assert tags, "Guaranteed by earlier logic in this function." 

1897 for tag in tags: 

1898 self._registry.disassociate(tag, refs) 

1899 # We've exited the Registry transaction, and apparently committed. 

1900 # (if there was an exception, everything rolled back, and it's as if 

1901 # nothing happened - and we never get here). 

1902 # Datastore artifacts are not yet gone, but they're clearly marked 

1903 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1904 # problems we can try again later, and if manual administrative 

1905 # intervention is required, it's pretty clear what that should entail: 

1906 # deleting everything on disk and in private Datastore tables that is 

1907 # in the dataset_location_trash table. 

1908 if unstore: 

1909 # Point of no return for removing artifacts 

1910 self._datastore.emptyTrash() 

1911 

1912 @transactional 

1913 def ingest( 

1914 self, 

1915 *datasets: FileDataset, 

1916 transfer: str | None = "auto", 

1917 run: str | None = None, 

1918 idGenerationMode: DatasetIdGenEnum | None = None, 

1919 record_validation_info: bool = True, 

1920 ) -> None: 

1921 """Store and register one or more datasets that already exist on disk. 

1922 

1923 Parameters 

1924 ---------- 

1925 datasets : `FileDataset` 

1926 Each positional argument is a struct containing information about 

1927 a file to be ingested, including its URI (either absolute or 

1928 relative to the datastore root, if applicable), a resolved 

1929 `DatasetRef`, and optionally a formatter class or its 

1930 fully-qualified string name. If a formatter is not provided, the 

1931 formatter that would be used for `put` is assumed. On successful 

1932 ingest all `FileDataset.formatter` attributes will be set to the 

1933 formatter class used. `FileDataset.path` attributes may be modified 

1934 to put paths in whatever the datastore considers a standardized 

1935 form. 

1936 transfer : `str`, optional 

1937 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1938 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1939 transfer the file. 

1940 run : `str`, optional 

1941 The name of the run ingested datasets should be added to, 

1942 overriding ``self.run``. This parameter is now deprecated since 

1943 the run is encoded in the ``FileDataset``. 

1944 idGenerationMode : `DatasetIdGenEnum`, optional 

1945 Specifies option for generating dataset IDs. Parameter is 

1946 deprecated. 

1947 record_validation_info : `bool`, optional 

1948 If `True`, the default, the datastore can record validation 

1949 information associated with the file. If `False` the datastore 

1950 will not attempt to track any information such as checksums 

1951 or file sizes. This can be useful if such information is tracked 

1952 in an external system or if the file is to be compressed in place. 

1953 It is up to the datastore whether this parameter is relevant. 

1954 

1955 Raises 

1956 ------ 

1957 TypeError 

1958 Raised if the butler is read-only or if no run was provided. 

1959 NotImplementedError 

1960 Raised if the `Datastore` does not support the given transfer mode. 

1961 DatasetTypeNotSupportedError 

1962 Raised if one or more files to be ingested have a dataset type that 

1963 is not supported by the `Datastore`.. 

1964 FileNotFoundError 

1965 Raised if one of the given files does not exist. 

1966 FileExistsError 

1967 Raised if transfer is not `None` but the (internal) location the 

1968 file would be moved to is already occupied. 

1969 

1970 Notes 

1971 ----- 

1972 This operation is not fully exception safe: if a database operation 

1973 fails, the given `FileDataset` instances may be only partially updated. 

1974 

1975 It is atomic in terms of database operations (they will either all 

1976 succeed or all fail) providing the database engine implements 

1977 transactions correctly. It will attempt to be atomic in terms of 

1978 filesystem operations as well, but this cannot be implemented 

1979 rigorously for most datastores. 

1980 """ 

1981 if not self.isWriteable(): 

1982 raise TypeError("Butler is read-only.") 

1983 

1984 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1985 if not datasets: 

1986 return 

1987 

1988 if idGenerationMode is not None: 

1989 warnings.warn( 

1990 "The idGenerationMode parameter is no longer used and is ignored. " 

1991 " Will be removed after v26.0", 

1992 FutureWarning, 

1993 stacklevel=2, 

1994 ) 

1995 

1996 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1997 

1998 # We need to reorganize all the inputs so that they are grouped 

1999 # by dataset type and run. Multiple refs in a single FileDataset 

2000 # are required to share the run and dataset type. 

2001 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

2002 groupedData: GroupedData = defaultdict(list) 

2003 

2004 # Track DataIDs that are being ingested so we can spot issues early 

2005 # with duplication. Retain previous FileDataset so we can report it. 

2006 groupedDataIds: MutableMapping[ 

2007 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

2008 ] = defaultdict(dict) 

2009 

2010 used_run = False 

2011 

2012 # And the nested loop that populates it: 

2013 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

2014 # Somewhere to store pre-existing refs if we have an 

2015 # execution butler. 

2016 existingRefs: list[DatasetRef] = [] 

2017 

2018 for ref in dataset.refs: 

2019 assert ref.run is not None # For mypy 

2020 group_key = (ref.datasetType, ref.run) 

2021 

2022 if ref.dataId in groupedDataIds[group_key]: 

2023 raise ConflictingDefinitionError( 

2024 f"Ingest conflict. Dataset {dataset.path} has same" 

2025 " DataId as other ingest dataset" 

2026 f" {groupedDataIds[group_key][ref.dataId].path} " 

2027 f" ({ref.dataId})" 

2028 ) 

2029 

2030 groupedDataIds[group_key][ref.dataId] = dataset 

2031 

2032 if existingRefs: 

2033 if len(dataset.refs) != len(existingRefs): 

2034 # Keeping track of partially pre-existing datasets is hard 

2035 # and should generally never happen. For now don't allow 

2036 # it. 

2037 raise ConflictingDefinitionError( 

2038 f"For dataset {dataset.path} some dataIds already exist" 

2039 " in registry but others do not. This is not supported." 

2040 ) 

2041 

2042 # Store expanded form in the original FileDataset. 

2043 dataset.refs = existingRefs 

2044 else: 

2045 groupedData[group_key].append(dataset) 

2046 

2047 if not used_run and run is not None: 

2048 warnings.warn( 

2049 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2050 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2051 category=FutureWarning, 

2052 stacklevel=3, # Take into account the @transactional decorator. 

2053 ) 

2054 

2055 # Now we can bulk-insert into Registry for each DatasetType. 

2056 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2057 groupedData.items(), desc="Bulk-inserting datasets by type" 

2058 ): 

2059 refs_to_import = [] 

2060 for dataset in grouped_datasets: 

2061 refs_to_import.extend(dataset.refs) 

2062 

2063 n_refs = len(refs_to_import) 

2064 log.verbose( 

2065 "Importing %d ref%s of dataset type %r into run %r", 

2066 n_refs, 

2067 "" if n_refs == 1 else "s", 

2068 datasetType.name, 

2069 this_run, 

2070 ) 

2071 

2072 # Import the refs and expand the DataCoordinates since we can't 

2073 # guarantee that they are expanded and Datastore will need 

2074 # the records. 

2075 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

2076 assert set(imported_refs) == set(refs_to_import) 

2077 

2078 # Replace all the refs in the FileDataset with expanded versions. 

2079 # Pull them off in the order we put them on the list. 

2080 for dataset in grouped_datasets: 

2081 n_dataset_refs = len(dataset.refs) 

2082 dataset.refs = imported_refs[:n_dataset_refs] 

2083 del imported_refs[:n_dataset_refs] 

2084 

2085 # Bulk-insert everything into Datastore. 

2086 # We do not know if any of the registry entries already existed 

2087 # (_importDatasets only complains if they exist but differ) so 

2088 # we have to catch IntegrityError explicitly. 

2089 try: 

2090 self._datastore.ingest( 

2091 *datasets, transfer=transfer, record_validation_info=record_validation_info 

2092 ) 

2093 except IntegrityError as e: 

2094 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

2095 

2096 @contextlib.contextmanager 

2097 def export( 

2098 self, 

2099 *, 

2100 directory: str | None = None, 

2101 filename: str | None = None, 

2102 format: str | None = None, 

2103 transfer: str | None = None, 

2104 ) -> Iterator[RepoExportContext]: 

2105 """Export datasets from the repository represented by this `Butler`. 

2106 

2107 This method is a context manager that returns a helper object 

2108 (`RepoExportContext`) that is used to indicate what information from 

2109 the repository should be exported. 

2110 

2111 Parameters 

2112 ---------- 

2113 directory : `str`, optional 

2114 Directory dataset files should be written to if ``transfer`` is not 

2115 `None`. 

2116 filename : `str`, optional 

2117 Name for the file that will include database information associated 

2118 with the exported datasets. If this is not an absolute path and 

2119 ``directory`` is not `None`, it will be written to ``directory`` 

2120 instead of the current working directory. Defaults to 

2121 "export.{format}". 

2122 format : `str`, optional 

2123 File format for the database information file. If `None`, the 

2124 extension of ``filename`` will be used. 

2125 transfer : `str`, optional 

2126 Transfer mode passed to `Datastore.export`. 

2127 

2128 Raises 

2129 ------ 

2130 TypeError 

2131 Raised if the set of arguments passed is inconsistent. 

2132 

2133 Examples 

2134 -------- 

2135 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2136 methods are used to provide the iterables over data IDs and/or datasets 

2137 to be exported:: 

2138 

2139 with butler.export("exports.yaml") as export: 

2140 # Export all flats, but none of the dimension element rows 

2141 # (i.e. data ID information) associated with them. 

2142 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2143 elements=()) 

2144 # Export all datasets that start with "deepCoadd_" and all of 

2145 # their associated data ID information. 

2146 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2147 """ 

2148 if directory is None and transfer is not None: 

2149 raise TypeError("Cannot transfer without providing a directory.") 

2150 if transfer == "move": 

2151 raise TypeError("Transfer may not be 'move': export is read-only") 

2152 if format is None: 

2153 if filename is None: 

2154 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2155 else: 

2156 _, format = os.path.splitext(filename) 

2157 if not format: 

2158 raise ValueError("Please specify a file extension to determine export format.") 

2159 format = format[1:] # Strip leading "."" 

2160 elif filename is None: 

2161 filename = f"export.{format}" 

2162 if directory is not None: 

2163 filename = os.path.join(directory, filename) 

2164 formats = self._config["repo_transfer_formats"] 

2165 if format not in formats: 

2166 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2167 BackendClass = get_class_of(formats[format, "export"]) 

2168 with open(filename, "w") as stream: 

2169 backend = BackendClass(stream, universe=self.dimensions) 

2170 try: 

2171 helper = RepoExportContext( 

2172 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

2173 ) 

2174 yield helper 

2175 except BaseException: 

2176 raise 

2177 else: 

2178 helper._finish() 

2179 

2180 def import_( 

2181 self, 

2182 *, 

2183 directory: ResourcePathExpression | None = None, 

2184 filename: ResourcePathExpression | TextIO | None = None, 

2185 format: str | None = None, 

2186 transfer: str | None = None, 

2187 skip_dimensions: set | None = None, 

2188 ) -> None: 

2189 """Import datasets into this repository that were exported from a 

2190 different butler repository via `~lsst.daf.butler.Butler.export`. 

2191 

2192 Parameters 

2193 ---------- 

2194 directory : `~lsst.resources.ResourcePathExpression`, optional 

2195 Directory containing dataset files to import from. If `None`, 

2196 ``filename`` and all dataset file paths specified therein must 

2197 be absolute. 

2198 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2199 A stream or name of file that contains database information 

2200 associated with the exported datasets, typically generated by 

2201 `~lsst.daf.butler.Butler.export`. If this a string (name) or 

2202 `~lsst.resources.ResourcePath` and is not an absolute path, 

2203 it will first be looked for relative to ``directory`` and if not 

2204 found there it will be looked for in the current working 

2205 directory. Defaults to "export.{format}". 

2206 format : `str`, optional 

2207 File format for ``filename``. If `None`, the extension of 

2208 ``filename`` will be used. 

2209 transfer : `str`, optional 

2210 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2211 skip_dimensions : `set`, optional 

2212 Names of dimensions that should be skipped and not imported. 

2213 

2214 Raises 

2215 ------ 

2216 TypeError 

2217 Raised if the set of arguments passed is inconsistent, or if the 

2218 butler is read-only. 

2219 """ 

2220 if not self.isWriteable(): 

2221 raise TypeError("Butler is read-only.") 

2222 if format is None: 

2223 if filename is None: 

2224 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2225 else: 

2226 _, format = os.path.splitext(filename) # type: ignore 

2227 elif filename is None: 

2228 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2229 if directory is not None: 

2230 directory = ResourcePath(directory, forceDirectory=True) 

2231 # mypy doesn't think this will work but it does in python >= 3.10. 

2232 if isinstance(filename, ResourcePathExpression): # type: ignore 

2233 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2234 if not filename.isabs() and directory is not None: 

2235 potential = directory.join(filename) 

2236 exists_in_cwd = filename.exists() 

2237 exists_in_dir = potential.exists() 

2238 if exists_in_cwd and exists_in_dir: 

2239 log.warning( 

2240 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2241 "Additionally, the file exists relative to the given search directory (%s). " 

2242 "Using the export file in the given directory.", 

2243 filename, 

2244 potential, 

2245 ) 

2246 # Given they specified an explicit directory and that 

2247 # directory has the export file in it, assume that that 

2248 # is what was meant despite the file in cwd. 

2249 filename = potential 

2250 elif exists_in_dir: 

2251 filename = potential 

2252 elif not exists_in_cwd and not exists_in_dir: 

2253 # Raise early. 

2254 raise FileNotFoundError( 

2255 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2256 ) 

2257 BackendClass: type[RepoImportBackend] = get_class_of( 

2258 self._config["repo_transfer_formats"][format]["import"] 

2259 ) 

2260 

2261 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2262 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

2263 backend.register() 

2264 with self.transaction(): 

2265 backend.load( 

2266 self._datastore, 

2267 directory=directory, 

2268 transfer=transfer, 

2269 skip_dimensions=skip_dimensions, 

2270 ) 

2271 

2272 if isinstance(filename, ResourcePath): 

2273 # We can not use open() here at the moment because of 

2274 # DM-38589 since yaml does stream.read(8192) in a loop. 

2275 stream = io.StringIO(filename.read().decode()) 

2276 doImport(stream) 

2277 else: 

2278 doImport(filename) # type: ignore 

2279 

2280 def transfer_from( 

2281 self, 

2282 source_butler: LimitedButler, 

2283 source_refs: Iterable[DatasetRef], 

2284 transfer: str = "auto", 

2285 skip_missing: bool = True, 

2286 register_dataset_types: bool = False, 

2287 transfer_dimensions: bool = False, 

2288 ) -> collections.abc.Collection[DatasetRef]: 

2289 """Transfer datasets to this Butler from a run in another Butler. 

2290 

2291 Parameters 

2292 ---------- 

2293 source_butler : `LimitedButler` 

2294 Butler from which the datasets are to be transferred. If data IDs 

2295 in ``source_refs`` are not expanded then this has to be a full 

2296 `Butler` whose registry will be used to expand data IDs. 

2297 source_refs : iterable of `DatasetRef` 

2298 Datasets defined in the source butler that should be transferred to 

2299 this butler. 

2300 transfer : `str`, optional 

2301 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2302 skip_missing : `bool` 

2303 If `True`, datasets with no datastore artifact associated with 

2304 them are not transferred. If `False` a registry entry will be 

2305 created even if no datastore record is created (and so will 

2306 look equivalent to the dataset being unstored). 

2307 register_dataset_types : `bool` 

2308 If `True` any missing dataset types are registered. Otherwise 

2309 an exception is raised. 

2310 transfer_dimensions : `bool`, optional 

2311 If `True`, dimension record data associated with the new datasets 

2312 will be transferred. 

2313 

2314 Returns 

2315 ------- 

2316 refs : `list` of `DatasetRef` 

2317 The refs added to this Butler. 

2318 

2319 Notes 

2320 ----- 

2321 The datastore artifact has to exist for a transfer 

2322 to be made but non-existence is not an error. 

2323 

2324 Datasets that already exist in this run will be skipped. 

2325 

2326 The datasets are imported as part of a transaction, although 

2327 dataset types are registered before the transaction is started. 

2328 This means that it is possible for a dataset type to be registered 

2329 even though transfer has failed. 

2330 """ 

2331 if not self.isWriteable(): 

2332 raise TypeError("Butler is read-only.") 

2333 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2334 

2335 # Will iterate through the refs multiple times so need to convert 

2336 # to a list if this isn't a collection. 

2337 if not isinstance(source_refs, collections.abc.Collection): 

2338 source_refs = list(source_refs) 

2339 

2340 original_count = len(source_refs) 

2341 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2342 

2343 # In some situations the datastore artifact may be missing 

2344 # and we do not want that registry entry to be imported. 

2345 # Asking datastore is not sufficient, the records may have been 

2346 # purged, we have to ask for the (predicted) URI and check 

2347 # existence explicitly. Execution butler is set up exactly like 

2348 # this with no datastore records. 

2349 artifact_existence: dict[ResourcePath, bool] = {} 

2350 if skip_missing: 

2351 dataset_existence = source_butler._datastore.mexists( 

2352 source_refs, artifact_existence=artifact_existence 

2353 ) 

2354 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2355 filtered_count = len(source_refs) 

2356 n_missing = original_count - filtered_count 

2357 log.verbose( 

2358 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2359 n_missing, 

2360 "" if n_missing == 1 else "s", 

2361 filtered_count, 

2362 ) 

2363 

2364 # Importing requires that we group the refs by dataset type and run 

2365 # before doing the import. 

2366 source_dataset_types = set() 

2367 grouped_refs = defaultdict(list) 

2368 for ref in source_refs: 

2369 grouped_refs[ref.datasetType, ref.run].append(ref) 

2370 source_dataset_types.add(ref.datasetType) 

2371 

2372 # Check to see if the dataset type in the source butler has 

2373 # the same definition in the target butler and register missing 

2374 # ones if requested. Registration must happen outside a transaction. 

2375 newly_registered_dataset_types = set() 

2376 for datasetType in source_dataset_types: 

2377 if register_dataset_types: 

2378 # Let this raise immediately if inconsistent. Continuing 

2379 # on to find additional inconsistent dataset types 

2380 # might result in additional unwanted dataset types being 

2381 # registered. 

2382 if self._registry.registerDatasetType(datasetType): 

2383 newly_registered_dataset_types.add(datasetType) 

2384 else: 

2385 # If the dataset type is missing, let it fail immediately. 

2386 target_dataset_type = self._registry.getDatasetType(datasetType.name) 

2387 if target_dataset_type != datasetType: 

2388 raise ConflictingDefinitionError( 

2389 "Source butler dataset type differs from definition" 

2390 f" in target butler: {datasetType} !=" 

2391 f" {target_dataset_type}" 

2392 ) 

2393 if newly_registered_dataset_types: 

2394 # We may have registered some even if there were inconsistencies 

2395 # but should let people know (or else remove them again). 

2396 log.log( 

2397 VERBOSE, 

2398 "Registered the following dataset types in the target Butler: %s", 

2399 ", ".join(d.name for d in newly_registered_dataset_types), 

2400 ) 

2401 else: 

2402 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2403 

2404 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2405 if transfer_dimensions: 

2406 # Collect all the dimension records for these refs. 

2407 # All dimensions are to be copied but the list of valid dimensions 

2408 # come from this butler's universe. 

2409 elements = frozenset( 

2410 element 

2411 for element in self.dimensions.getStaticElements() 

2412 if element.hasTable() and element.viewOf is None 

2413 ) 

2414 dataIds = {ref.dataId for ref in source_refs} 

2415 # This logic comes from saveDataIds. 

2416 for dataId in dataIds: 

2417 # Need an expanded record, if not expanded that we need a full 

2418 # butler with registry (allow mocks with registry too). 

2419 if not dataId.hasRecords(): 

2420 if registry := getattr(source_butler, "registry", None): 

2421 dataId = registry.expandDataId(dataId) 

2422 else: 

2423 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2424 # If this butler doesn't know about a dimension in the source 

2425 # butler things will break later. 

2426 for record in dataId.records.values(): 

2427 if record is not None and record.definition in elements: 

2428 dimension_records[record.definition].setdefault(record.dataId, record) 

2429 

2430 handled_collections: set[str] = set() 

2431 

2432 # Do all the importing in a single transaction. 

2433 with self.transaction(): 

2434 if dimension_records: 

2435 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2436 for element, r in dimension_records.items(): 

2437 records = [r[dataId] for dataId in r] 

2438 # Assume that if the record is already present that we can 

2439 # use it without having to check that the record metadata 

2440 # is consistent. 

2441 self._registry.insertDimensionData(element, *records, skip_existing=True) 

2442 

2443 n_imported = 0 

2444 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2445 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2446 ): 

2447 if run not in handled_collections: 

2448 # May need to create output collection. If source butler 

2449 # has a registry, ask for documentation string. 

2450 run_doc = None 

2451 if registry := getattr(source_butler, "registry", None): 

2452 run_doc = registry.getCollectionDocumentation(run) 

2453 registered = self._registry.registerRun(run, doc=run_doc) 

2454 handled_collections.add(run) 

2455 if registered: 

2456 log.log(VERBOSE, "Creating output run %s", run) 

2457 

2458 n_refs = len(refs_to_import) 

2459 log.verbose( 

2460 "Importing %d ref%s of dataset type %s into run %s", 

2461 n_refs, 

2462 "" if n_refs == 1 else "s", 

2463 datasetType.name, 

2464 run, 

2465 ) 

2466 

2467 # Assume we are using UUIDs and the source refs will match 

2468 # those imported. 

2469 imported_refs = self._registry._importDatasets(refs_to_import, expand=False) 

2470 assert set(imported_refs) == set(refs_to_import) 

2471 n_imported += len(imported_refs) 

2472 

2473 assert len(source_refs) == n_imported 

2474 log.verbose("Imported %d datasets into destination butler", n_imported) 

2475 

2476 # Ask the datastore to transfer. The datastore has to check that 

2477 # the source datastore is compatible with the target datastore. 

2478 accepted, rejected = self._datastore.transfer_from( 

2479 source_butler._datastore, 

2480 source_refs, 

2481 transfer=transfer, 

2482 artifact_existence=artifact_existence, 

2483 ) 

2484 if rejected: 

2485 # For now, accept the registry entries but not the files. 

2486 log.warning( 

2487 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2488 len(rejected), 

2489 len(accepted), 

2490 datasetType, 

2491 run, 

2492 ) 

2493 

2494 return source_refs 

2495 

2496 def validateConfiguration( 

2497 self, 

2498 logFailures: bool = False, 

2499 datasetTypeNames: Iterable[str] | None = None, 

2500 ignore: Iterable[str] | None = None, 

2501 ) -> None: 

2502 """Validate butler configuration. 

2503 

2504 Checks that each `DatasetType` can be stored in the `Datastore`. 

2505 

2506 Parameters 

2507 ---------- 

2508 logFailures : `bool`, optional 

2509 If `True`, output a log message for every validation error 

2510 detected. 

2511 datasetTypeNames : iterable of `str`, optional 

2512 The `DatasetType` names that should be checked. This allows 

2513 only a subset to be selected. 

2514 ignore : iterable of `str`, optional 

2515 Names of DatasetTypes to skip over. This can be used to skip 

2516 known problems. If a named `DatasetType` corresponds to a 

2517 composite, all components of that `DatasetType` will also be 

2518 ignored. 

2519 

2520 Raises 

2521 ------ 

2522 ButlerValidationError 

2523 Raised if there is some inconsistency with how this Butler 

2524 is configured. 

2525 """ 

2526 if datasetTypeNames: 

2527 datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] 

2528 else: 

2529 datasetTypes = list(self._registry.queryDatasetTypes()) 

2530 

2531 # filter out anything from the ignore list 

2532 if ignore: 

2533 ignore = set(ignore) 

2534 datasetTypes = [ 

2535 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2536 ] 

2537 else: 

2538 ignore = set() 

2539 

2540 # For each datasetType that has an instrument dimension, create 

2541 # a DatasetRef for each defined instrument 

2542 datasetRefs = [] 

2543 

2544 # Find all the registered instruments (if "instrument" is in the 

2545 # universe). 

2546 if "instrument" in self.dimensions: 

2547 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2548 

2549 for datasetType in datasetTypes: 

2550 if "instrument" in datasetType.dimensions: 

2551 # In order to create a conforming dataset ref, create 

2552 # fake DataCoordinate values for the non-instrument 

2553 # dimensions. The type of the value does not matter here. 

2554 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} 

2555 

2556 for instrument in instruments: 

2557 datasetRef = DatasetRef( 

2558 datasetType, 

2559 DataCoordinate.standardize( 

2560 dataId, instrument=instrument, graph=datasetType.dimensions 

2561 ), 

2562 run="validate", 

2563 ) 

2564 datasetRefs.append(datasetRef) 

2565 

2566 entities: list[DatasetType | DatasetRef] = [] 

2567 entities.extend(datasetTypes) 

2568 entities.extend(datasetRefs) 

2569 

2570 datastoreErrorStr = None 

2571 try: 

2572 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2573 except ValidationError as e: 

2574 datastoreErrorStr = str(e) 

2575 

2576 # Also check that the LookupKeys used by the datastores match 

2577 # registry and storage class definitions 

2578 keys = self._datastore.getLookupKeys() 

2579 

2580 failedNames = set() 

2581 failedDataId = set() 

2582 for key in keys: 

2583 if key.name is not None: 

2584 if key.name in ignore: 

2585 continue 

2586 

2587 # skip if specific datasetType names were requested and this 

2588 # name does not match 

2589 if datasetTypeNames and key.name not in datasetTypeNames: 

2590 continue 

2591 

2592 # See if it is a StorageClass or a DatasetType 

2593 if key.name in self.storageClasses: 

2594 pass 

2595 else: 

2596 try: 

2597 self._registry.getDatasetType(key.name) 

2598 except KeyError: 

2599 if logFailures: 

2600 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2601 failedNames.add(key) 

2602 else: 

2603 # Dimensions are checked for consistency when the Butler 

2604 # is created and rendezvoused with a universe. 

2605 pass 

2606 

2607 # Check that the instrument is a valid instrument 

2608 # Currently only support instrument so check for that 

2609 if key.dataId: 

2610 dataIdKeys = set(key.dataId) 

2611 if {"instrument"} != dataIdKeys: 

2612 if logFailures: 

2613 log.critical("Key '%s' has unsupported DataId override", key) 

2614 failedDataId.add(key) 

2615 elif key.dataId["instrument"] not in instruments: 

2616 if logFailures: 

2617 log.critical("Key '%s' has unknown instrument", key) 

2618 failedDataId.add(key) 

2619 

2620 messages = [] 

2621 

2622 if datastoreErrorStr: 

2623 messages.append(datastoreErrorStr) 

2624 

2625 for failed, msg in ( 

2626 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2627 (failedDataId, "Keys with bad DataId entries: "), 

2628 ): 

2629 if failed: 

2630 msg += ", ".join(str(k) for k in failed) 

2631 messages.append(msg) 

2632 

2633 if messages: 

2634 raise ValidationError(";\n".join(messages)) 

2635 

2636 @property 

2637 def collections(self) -> Sequence[str]: 

2638 """The collections to search by default, in order 

2639 (`~collections.abc.Sequence` [ `str` ]). 

2640 

2641 This is an alias for ``self.registry.defaults.collections``. It cannot 

2642 be set directly in isolation, but all defaults may be changed together 

2643 by assigning a new `RegistryDefaults` instance to 

2644 ``self.registry.defaults``. 

2645 """ 

2646 return self._registry.defaults.collections 

2647 

2648 @property 

2649 def run(self) -> str | None: 

2650 """Name of the run this butler writes outputs to by default (`str` or 

2651 `None`). 

2652 

2653 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2654 directly in isolation, but all defaults may be changed together by 

2655 assigning a new `RegistryDefaults` instance to 

2656 ``self.registry.defaults``. 

2657 """ 

2658 return self._registry.defaults.run 

2659 

2660 @property 

2661 def registry(self) -> Registry: 

2662 """The object that manages dataset metadata and relationships 

2663 (`Registry`). 

2664 

2665 Many operations that don't involve reading or writing butler datasets 

2666 are accessible only via `Registry` methods. Eventually these methods 

2667 will be replaced by equivalent `Butler` methods. 

2668 """ 

2669 return self._registry_shim 

2670 

2671 @property 

2672 def dimensions(self) -> DimensionUniverse: 

2673 # Docstring inherited. 

2674 return self._registry.dimensions 

2675 

2676 _registry: _ButlerRegistry 

2677 """The object that manages dataset metadata and relationships 

2678 (`_ButlerRegistry`). 

2679 

2680 Most operations that don't involve reading or writing butler datasets are 

2681 accessible only via `Registry` methods. 

2682 """ 

2683 

2684 datastore: Datastore 

2685 """The object that manages actual dataset storage (`Datastore`). 

2686 

2687 Direct user access to the datastore should rarely be necessary; the primary 

2688 exception is the case where a `Datastore` implementation provides extra 

2689 functionality beyond what the base class defines. 

2690 """ 

2691 

2692 storageClasses: StorageClassFactory 

2693 """An object that maps known storage class names to objects that fully 

2694 describe them (`StorageClassFactory`). 

2695 """