Coverage for python/lsst/daf/butler/_butler.py: 11%

724 statements  

coverage.py v7.3.2, created at 2023-10-25 15:14 +0000

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Butler top level classes.
"""
from __future__ import annotations

__all__ = (
    "Butler",
    "ButlerValidationError",
)

import collections.abc
import contextlib
import io
import logging
import numbers
import os
import warnings
from collections import Counter, defaultdict
from collections.abc import Iterable, Iterator, MutableMapping, Sequence
from typing import TYPE_CHECKING, Any, ClassVar, TextIO

from deprecated.sphinx import deprecated
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils import doImportType
from lsst.utils.introspection import get_class_of
from lsst.utils.logging import VERBOSE, getLogger
from sqlalchemy.exc import IntegrityError

from ._butlerConfig import ButlerConfig
from ._butlerRepoIndex import ButlerRepoIndex
from ._dataset_existence import DatasetExistence
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._limited_butler import LimitedButler
from ._registry_shim import RegistryShim
from .core import (
    Config,
    ConfigSubset,
    DataCoordinate,
    DataId,
    DataIdValue,
    DatasetIdGenEnum,
    DatasetRef,
    DatasetRefURIs,
    DatasetType,
    Datastore,
    Dimension,
    DimensionConfig,
    DimensionElement,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    NullDatastore,
    Progress,
    StorageClass,
    StorageClassFactory,
    Timespan,
    ValidationError,
)
from .core.repoRelocation import BUTLER_ROOT_TAG
from .core.utils import transactional
from .registry import (
    CollectionType,
    ConflictingDefinitionError,
    DataIdError,
    MissingDatasetTypeError,
    NoDefaultCollectionError,
    Registry,
    RegistryConfig,
    RegistryDefaults,
    _ButlerRegistry,
    _RegistryFactory,
)
from .transfers import RepoExportContext

if TYPE_CHECKING:
    from lsst.resources import ResourceHandleProtocol

    from .transfers import RepoImportBackend

log = getLogger(__name__)


class ButlerValidationError(ValidationError):
    """There is a problem with the Butler configuration."""

    pass


class Butler(LimitedButler):
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration. Anything acceptable to the
        `ButlerConfig` constructor. If a directory path
        is given the configuration will be read from a ``butler.yaml`` file in
        that location. If `None` is given default values will be used.
    butler : `Butler`, optional
        If provided, construct a new Butler that uses the same registry and
        datastore as the given one, but with the given collection and run.
        Incompatible with the ``config``, ``searchPaths``, and ``writeable``
        arguments.
    collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
        An expression specifying the collections to be searched (in order)
        when reading datasets.
        This may be a `str` collection name or an iterable thereof.
        See :ref:`daf_butler_collection_expressions` for more information.
        These collections are not registered automatically and must be
        manually registered before they are used by any method, but they may
        be manually registered after the `Butler` is initialized.
    run : `str`, optional
        Name of the `~CollectionType.RUN` collection new datasets should be
        inserted into. If ``collections`` is `None` and ``run`` is not `None`,
        ``collections`` will be set to ``[run]``. If not `None`, this
        collection will automatically be registered. If this is not set (and
        ``writeable`` is not set either), a read-only butler will be created.
    searchPaths : `list` of `str`, optional
        Directory paths to search when calculating the full Butler
        configuration. Not used if the supplied config is already a
        `ButlerConfig`.
    writeable : `bool`, optional
        Explicitly sets whether the butler supports write operations. If not
        provided, a read-write butler is created if any of ``run``, ``tags``,
        or ``chains`` is non-empty.
    inferDefaults : `bool`, optional
        If `True` (default) infer default data ID values from the values
        present in the datasets in ``collections``: if all collections have
        the same value (or no value) for a governor dimension, that value
        will be the default for that dimension. Nonexistent collections are
        ignored. If a default value is provided explicitly for a governor
        dimension via ``**kwargs``, no default will be inferred for that
        dimension.
    without_datastore : `bool`, optional
        If `True` do not attach a datastore to this butler. Any attempts
        to use a datastore will fail.
    **kwargs : `str`
        Default data ID key-value pairs. These may only identify "governor"
        dimensions like ``instrument`` and ``skymap``.

    Examples
    --------
    While there are many ways to control exactly how a `Butler` interacts with
    the collections in its `Registry`, the most common cases are still simple.

    For a read-only `Butler` that searches one collection, do::

        butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])

    For a read-write `Butler` that writes to and reads from a
    `~CollectionType.RUN` collection::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")

    The `Butler` passed to a ``PipelineTask`` is often much more complex,
    because we want to write to one `~CollectionType.RUN` collection but read
    from several others (as well)::

        butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
                        collections=["u/alice/DM-50000/a",
                                     "u/bob/DM-49998",
                                     "HSC/defaults"])

    This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
    Datasets will be read first from that run (since it appears first in the
    chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.

    Finally, one can always create a `Butler` with no collections::

        butler = Butler("/path/to/repo", writeable=True)

    This can be extremely useful when you just want to use ``butler.registry``,
    e.g. for inserting dimension data or managing collections, or when the
    collections you want to use with the butler are not consistent.
    Passing ``writeable`` explicitly here is only necessary if you want to be
    able to make changes to the repo - usually the value for ``writeable`` can
    be guessed from the collection arguments provided, but it defaults to
    `False` when there are no collection arguments.
    """


    def __init__(
        self,
        config: Config | ResourcePathExpression | None = None,
        *,
        butler: Butler | None = None,
        collections: Any = None,
        run: str | None = None,
        searchPaths: Sequence[ResourcePathExpression] | None = None,
        writeable: bool | None = None,
        inferDefaults: bool = True,
        without_datastore: bool = False,
        **kwargs: str,
    ):
        defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
        # Load registry, datastore, etc. from config or existing butler.
        if butler is not None:
            if config is not None or searchPaths is not None or writeable is not None:
                raise TypeError(
                    "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
                )
            self._registry = butler._registry.copy(defaults)
            self._datastore = butler._datastore
            self.storageClasses = butler.storageClasses
            self._config: ButlerConfig = butler._config
        else:
            self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
            try:
                butlerRoot = self._config.get("root", self._config.configDir)
                if writeable is None:
                    writeable = run is not None
                self._registry = _RegistryFactory(self._config).from_config(
                    butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
                )
                if without_datastore:
                    self._datastore = NullDatastore(None, None)
                else:
                    self._datastore = Datastore.fromConfig(
                        self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
                    )
                self.storageClasses = StorageClassFactory()
                self.storageClasses.addFromConfig(self._config)
            except Exception:
                # Failures here usually mean that configuration is incomplete,
                # just issue an error message which includes config file URI.
                log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
                raise

        # For execution butler the datastore needs a special
        # dependency-inversion trick. This is not used by regular butler,
        # but we do not have a way to distinguish regular butler from
        # execution butler.
        self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)

        if "run" in self._config or "collection" in self._config:
            raise ValueError("Passing a run or collection via configuration is no longer supported.")

        self._registry_shim = RegistryShim(self)

    GENERATION: ClassVar[int] = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

    def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
        """Return DatasetType defined in registry given dataset type name."""
        try:
            return self._registry.getDatasetType(name)
        except MissingDatasetTypeError:
            return None

    @classmethod
    def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
        """Look up the label in a butler repository index.

        Parameters
        ----------
        label : `str`
            Label of the Butler repository to look up.
        return_label : `bool`, optional
            If ``label`` cannot be found in the repository index (either
            because index is not defined or ``label`` is not in the index) and
            ``return_label`` is `True` then return ``ResourcePath(label)``.
            If ``return_label`` is `False` (default) then an exception will be
            raised instead.

        Returns
        -------
        uri : `lsst.resources.ResourcePath`
            URI to the Butler repository associated with the given label or
            default value if it is provided.

        Raises
        ------
        KeyError
            Raised if the label is not found in the index, or if an index
            is not defined, and ``return_label`` is `False`.

        Notes
        -----
        See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
        information is discovered.
        """
        return ButlerRepoIndex.get_repo_uri(label, return_label)
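
    # Example (illustrative sketch, not part of the module): resolving a
    # repository label to a URI with the classmethod above. The label "main"
    # and the fallback path are hypothetical values used only for
    # illustration.
    #
    #     uri = Butler.get_repo_uri("main")  # raises KeyError if "main" is not indexed
    #     uri = Butler.get_repo_uri("/some/path", return_label=True)  # falls back to ResourcePath("/some/path")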

    @classmethod
    def get_known_repos(cls) -> set[str]:
        """Retrieve the list of known repository labels.

        Returns
        -------
        repos : `set` of `str`
            All the known labels. Can be empty if no index can be found.

        Notes
        -----
        See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
        information is discovered.
        """
        return ButlerRepoIndex.get_known_repos()

    @staticmethod
    def makeRepo(
        root: ResourcePathExpression,
        config: Config | str | None = None,
        dimensionConfig: Config | str | None = None,
        standalone: bool = False,
        searchPaths: list[str] | None = None,
        forceConfigRoot: bool = True,
        outfile: ResourcePathExpression | None = None,
        overwrite: bool = False,
    ) -> Config:
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `lsst.resources.ResourcePathExpression`
            Path or URI to the root location of the new repository. Will be
            created if it does not exist.
        config : `Config` or `str`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options. Can not
            be a `ButlerConfig` or a `ConfigSubset`. If `None`, default
            configuration will be used. Root-dependent config options
            specified in this config are overwritten if ``forceConfigRoot``
            is `True`.
        dimensionConfig : `Config` or `str`, optional
            Configuration for dimensions, will be used to initialize registry
            database.
        standalone : `bool`
            If `True`, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        searchPaths : `list` of `str`, optional
            Directory paths to search when calculating the full butler
            configuration.
        forceConfigRoot : `bool`, optional
            If `False`, any values present in the supplied ``config`` that
            would normally be reset are not overridden and will appear
            directly in the output config. This allows non-standard overrides
            of the root directory for a datastore or registry to be given.
            If this parameter is `True` the values for ``root`` will be
            forced into the resulting config if appropriate.
        outfile : `lsst.resources.ResourcePathExpression`, optional
            If not-`None`, the output configuration will be written to this
            location rather than into the repository itself. Can be a URI
            string. Can refer to a directory that will be used to write
            ``butler.yaml``.
        overwrite : `bool`, optional
            Create a new configuration file even if one already exists
            in the specified output location. Default is to raise
            an exception.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        FileExistsError
            Raised if the output config file already exists.
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        Note that when ``standalone=False`` (the default), the configuration
        search path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any Butlers
        to avoid configuration inconsistencies.
        """
        if isinstance(config, ButlerConfig | ConfigSubset):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")

        # Ensure that the root of the repository exists or can be made
        root_uri = ResourcePath(root, forceDirectory=True)
        root_uri.mkdir()

        config = Config(config)

        # If we are creating a new repo from scratch with relative roots,
        # do not propagate an explicit root from the config file
        if "root" in config:
            del config["root"]

        full = ButlerConfig(config, searchPaths=searchPaths)  # this applies defaults
        imported_class = doImportType(full["datastore", "cls"])
        if not issubclass(imported_class, Datastore):
            raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore")
        datastoreClass: type[Datastore] = imported_class
        datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot)

        # if key exists in given config, parse it, otherwise parse the defaults
        # in the expanded config
        if config.get(("registry", "db")):
            registryConfig = RegistryConfig(config)
        else:
            registryConfig = RegistryConfig(full)
        defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG)
        if defaultDatabaseUri is not None:
            Config.updateParameters(
                RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot
            )
        else:
            Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot)

        if standalone:
            config.merge(full)
        else:
            # Always expand the registry.managers section into the per-repo
            # config, because after the database schema is created, it's not
            # allowed to change anymore. Note that in the standalone=True
            # branch, _everything_ in the config is expanded, so there's no
            # need to special case this.
            Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False)
        configURI: ResourcePathExpression
        if outfile is not None:
            # When writing to a separate location we must include
            # the root of the butler repo in the config else it won't know
            # where to look.
            config["root"] = root_uri.geturl()
            configURI = outfile
        else:
            configURI = root_uri
        # Strip obscore configuration, if it is present, before writing config
        # to a file, obscore config will be stored in registry.
        if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config:
            config_to_write = config.copy()
            del config_to_write[obscore_config_key]
            config_to_write.dumpToUri(configURI, overwrite=overwrite)
            # configFile attribute is updated, need to copy it to original.
            config.configFile = config_to_write.configFile
        else:
            config.dumpToUri(configURI, overwrite=overwrite)

        # Create Registry and populate tables
        registryConfig = RegistryConfig(config.get("registry"))
        dimensionConfig = DimensionConfig(dimensionConfig)
        _RegistryFactory(registryConfig).create_from_config(
            dimensionConfig=dimensionConfig, butlerRoot=root_uri
        )

        log.verbose("Wrote new Butler configuration file to %s", configURI)

        return config
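
    # Example (illustrative sketch, not part of the module): creating a new
    # repository with the static method above and then constructing a
    # writeable Butler against it. The paths and run name are hypothetical.
    #
    #     Butler.makeRepo("/tmp/new_repo")
    #     butler = Butler("/tmp/new_repo", writeable=True, run="u/alice/scratch")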

    @classmethod
    def _unpickle(
        cls,
        config: ButlerConfig,
        collections: tuple[str, ...] | None,
        run: str | None,
        defaultDataId: dict[str, str],
        writeable: bool,
    ) -> Butler:
        """Callable used to unpickle a Butler.

        We prefer not to use ``Butler.__init__`` directly so we can force some
        of its many arguments to be keyword-only (note that ``__reduce__``
        can only invoke callables with positional arguments).

        Parameters
        ----------
        config : `ButlerConfig`
            Butler configuration, already coerced into a true `ButlerConfig`
            instance (and hence after any search paths for overrides have been
            utilized).
        collections : `tuple` [ `str` ]
            Names of the default collections to read from.
        run : `str`, optional
            Name of the default `~CollectionType.RUN` collection to write to.
        defaultDataId : `dict` [ `str`, `str` ]
            Default data ID values.
        writeable : `bool`
            Whether the Butler should support write operations.

        Returns
        -------
        butler : `Butler`
            A new `Butler` instance.
        """
        # MyPy doesn't recognize that the kwargs below are totally valid; it
        # seems to think ``**defaultDataId`` is a _positional_ argument!
        return cls(
            config=config,
            collections=collections,
            run=run,
            writeable=writeable,
            **defaultDataId,  # type: ignore
        )

    def __reduce__(self) -> tuple:
        """Support pickling."""
        return (
            Butler._unpickle,
            (
                self._config,
                self.collections,
                self.run,
                self._registry.defaults.dataId.byName(),
                self._registry.isWriteable(),
            ),
        )

    def __str__(self) -> str:
        return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
            self.collections, self.run, self._datastore, self._registry
        )

    def isWriteable(self) -> bool:
        """Return `True` if this `Butler` supports write operations."""
        return self._registry.isWriteable()

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        with self._registry.transaction(), self._datastore.transaction():
            yield
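
    # Example (illustrative sketch, not part of the module): grouping several
    # writes so that registry and datastore changes are committed or rolled
    # back together. The dataset type name and data ID values are
    # hypothetical.
    #
    #     with butler.transaction():
    #         butler.put(catalog_a, "src", instrument="HSC", visit=1, detector=0)
    #         butler.put(catalog_b, "src", instrument="HSC", visit=1, detector=1)
    #     # An exception raised inside the block rolls back both puts.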

    def _standardizeArgs(
        self,
        datasetRefOrType: DatasetRef | DatasetType | str,
        dataId: DataId | None = None,
        for_put: bool = True,
        **kwargs: Any,
    ) -> tuple[DatasetType, DataId | None]:
        """Standardize the arguments passed to several Butler APIs.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        for_put : `bool`, optional
            If `True` this call is invoked as part of a `Butler.put()`.
            Otherwise it is assumed to be part of a `Butler.get()`. This
            parameter is only relevant if there is dataset type
            inconsistency.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters.

        Returns
        -------
        datasetType : `DatasetType`
            A `DatasetType` instance extracted from ``datasetRefOrType``.
        dataId : `dict` or `DataId`, optional
            Argument that can be used (along with ``kwargs``) to construct a
            `DataId`.

        Notes
        -----
        Butler APIs that conceptually need a DatasetRef also allow passing a
        `DatasetType` (or the name of one) and a `DataId` (or a dict and
        keyword arguments that can be used to construct one) separately. This
        method accepts those arguments and always returns a true `DatasetType`
        and a `DataId` or `dict`.

        Standardization of `dict` vs `DataId` is best handled by passing the
        returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
        generally similarly flexible.
        """
        externalDatasetType: DatasetType | None = None
        internalDatasetType: DatasetType | None = None
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None or kwargs:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            externalDatasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
        else:
            # Don't check whether DataId is provided, because Registry APIs
            # can usually construct a better error message when it wasn't.
            if isinstance(datasetRefOrType, DatasetType):
                externalDatasetType = datasetRefOrType
            else:
                internalDatasetType = self._registry.getDatasetType(datasetRefOrType)

        # Check that they are self-consistent
        if externalDatasetType is not None:
            internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
            if externalDatasetType != internalDatasetType:
                # We can allow differences if they are compatible, depending
                # on whether this is a get or a put. A get requires that
                # the python type associated with the datastore can be
                # converted to the user type. A put requires that the user
                # supplied python type can be converted to the internal
                # type expected by registry.
                relevantDatasetType = internalDatasetType
                if for_put:
                    is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
                else:
                    is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
                    relevantDatasetType = externalDatasetType
                if not is_compatible:
                    raise ValueError(
                        f"Supplied dataset type ({externalDatasetType}) inconsistent with "
                        f"registry definition ({internalDatasetType})"
                    )
                # Override the internal definition.
                internalDatasetType = relevantDatasetType

        assert internalDatasetType is not None
        return internalDatasetType, dataId

    def _rewrite_data_id(
        self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
    ) -> tuple[DataId | None, dict[str, Any]]:
        """Rewrite a data ID taking into account dimension records.

        Take a Data ID and keyword args and rewrite it if necessary to
        allow the user to specify dimension records rather than dimension
        primary values.

        This allows a user to include a dataId dict with keys of
        ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
        the integer exposure ID. It also allows a string to be given
        for a dimension value rather than the integer ID if that is more
        convenient. For example, rather than having to specify the
        detector with ``detector.full_name``, a string given for ``detector``
        will be interpreted as the full name and converted to the integer
        value.

        Keyword arguments can also use strings for dimensions like detector
        and exposure but python does not allow them to include ``.`` and
        so the ``exposure.day_obs`` syntax can not be used in a keyword
        argument.

        Parameters
        ----------
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that will label the
            `DatasetRef` within a Collection.
        datasetType : `DatasetType`
            The dataset type associated with this dataId. Required to
            determine the relevant dimensions.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        dataId : `dict` or `DataCoordinate`
            The, possibly rewritten, dataId. If given a `DataCoordinate` and
            no keyword arguments, the original dataId will be returned
            unchanged.
        **kwargs : `dict`
            Any unused keyword arguments (would normally be empty dict).
        """
        # Do nothing if we have a standalone DataCoordinate.
        if isinstance(dataId, DataCoordinate) and not kwargs:
            return dataId, kwargs

        # Process dimension records that are using record information
        # rather than ids
        newDataId: dict[str, DataIdValue] = {}
        byRecord: dict[str, dict[str, Any]] = defaultdict(dict)

        # if all the dataId comes from keyword parameters we do not need
        # to do anything here because they can't be of the form
        # exposure.obs_id because a "." is not allowed in a keyword parameter.
        if dataId:
            for k, v in dataId.items():
                # If we have a Dimension we do not need to do anything
                # because it cannot be a compound key.
                if isinstance(k, str) and "." in k:
                    # Someone is using a more human-readable dataId
                    dimensionName, record = k.split(".", 1)
                    byRecord[dimensionName][record] = v
                elif isinstance(k, Dimension):
                    newDataId[k.name] = v
                else:
                    newDataId[k] = v

        # Go through the updated dataId and check the type in case someone is
        # using an alternate key. We have already filtered out the compound
        # key ``dimension.record`` format.
        not_dimensions = {}

        # Will need to look in the dataId and the keyword arguments
        # and will remove them if they need to be fixed or are unrecognized.
        for dataIdDict in (newDataId, kwargs):
            # Use a list so we can adjust the dict safely in the loop
            for dimensionName in list(dataIdDict):
                value = dataIdDict[dimensionName]
                try:
                    dimension = self.dimensions.getStaticDimensions()[dimensionName]
                except KeyError:
                    # This is not a real dimension
                    not_dimensions[dimensionName] = value
                    del dataIdDict[dimensionName]
                    continue

                # Convert an integral type to an explicit int to simplify
                # comparisons here
                if isinstance(value, numbers.Integral):
                    value = int(value)

                if not isinstance(value, dimension.primaryKey.getPythonType()):
                    for alternate in dimension.alternateKeys:
                        if isinstance(value, alternate.getPythonType()):
                            byRecord[dimensionName][alternate.name] = value
                            del dataIdDict[dimensionName]
                            log.debug(
                                "Converting dimension %s to %s.%s=%s",
                                dimensionName,
                                dimensionName,
                                alternate.name,
                                value,
                            )
                            break
                    else:
                        log.warning(
                            "Type mismatch found for value '%r' provided for dimension %s. "
                            "Could not find matching alternative (primary key has type %s) "
                            "so attempting to use as-is.",
                            value,
                            dimensionName,
                            dimension.primaryKey.getPythonType(),
                        )

        # By this point kwargs and newDataId should only include valid
        # dimensions. Merge kwargs in to the new dataId and log if there
        # are dimensions in both (rather than calling update).
        for k, v in kwargs.items():
            if k in newDataId and newDataId[k] != v:
                log.debug(
                    "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
                )
            newDataId[k] = v
        # No need to retain any values in kwargs now.
        kwargs = {}

        # If we have some unrecognized dimensions we have to try to connect
        # them to records in other dimensions. This is made more complicated
        # by some dimensions having records with clashing names. A mitigation
        # is that we can tell by this point which dimensions are missing
        # for the DatasetType but this does not work for calibrations
        # where additional dimensions can be used to constrain the temporal
        # axis.
        if not_dimensions:
            # Search for all dimensions even if we have been given a value
            # explicitly. In some cases records are given as well as the
            # actual dimension and this should not be an error if they
            # match.
            mandatoryDimensions = datasetType.dimensions.names  # - provided

            candidateDimensions: set[str] = set()
            candidateDimensions.update(mandatoryDimensions)

            # For calibrations we may well be needing temporal dimensions
            # so rather than always including all dimensions in the scan
            # restrict things a little. It is still possible for there
            # to be confusion over day_obs in visit vs exposure for example.
            # If we are not searching calibration collections things may
            # fail but they are going to fail anyway because of the
            # ambiguousness of the dataId...
            if datasetType.isCalibration():
                for dim in self.dimensions.getStaticDimensions():
                    if dim.temporal:
                        candidateDimensions.add(str(dim))

            # Look up table for the first association with a dimension
            guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)

            # Keep track of whether an item is associated with multiple
            # dimensions.
            counter: Counter[str] = Counter()
            assigned: dict[str, set[str]] = defaultdict(set)

            # Go through the missing dimensions and associate the
            # given names with records within those dimensions
            matched_dims = set()
            for dimensionName in candidateDimensions:
                dimension = self.dimensions.getStaticDimensions()[dimensionName]
                fields = dimension.metadata.names | dimension.uniqueKeys.names
                for field in not_dimensions:
                    if field in fields:
                        guessedAssociation[dimensionName][field] = not_dimensions[field]
                        counter[dimensionName] += 1
                        assigned[field].add(dimensionName)
                        matched_dims.add(field)

            # Calculate the fields that matched nothing.
            never_found = set(not_dimensions) - matched_dims

            if never_found:
                raise ValueError(f"Unrecognized keyword args given: {never_found}")

            # There is a chance we have allocated a single dataId item
            # to multiple dimensions. Need to decide which should be retained.
            # For now assume that the most popular alternative wins.
            # This means that day_obs with seq_num will result in
            # exposure.day_obs and not visit.day_obs
            # Also prefer an explicitly missing dimension over an inferred
            # temporal dimension.
            for fieldName, assignedDimensions in assigned.items():
                if len(assignedDimensions) > 1:
                    # Pick the most popular (preferring mandatory dimensions)
                    requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
                    if requiredButMissing:
                        candidateDimensions = requiredButMissing
                    else:
                        candidateDimensions = assignedDimensions

                        # If this is a choice between visit and exposure and
                        # neither was a required part of the dataset type,
                        # (hence in this branch) always prefer exposure over
                        # visit since exposures are always defined and visits
                        # are defined from exposures.
                        if candidateDimensions == {"exposure", "visit"}:
                            candidateDimensions = {"exposure"}

                    # Select the relevant items and get a new restricted
                    # counter.
                    theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
                    duplicatesCounter: Counter[str] = Counter()
                    duplicatesCounter.update(theseCounts)

                    # Choose the most common. If they are equally common
                    # we will pick the one that was found first.
                    # Returns a list of tuples
                    selected = duplicatesCounter.most_common(1)[0][0]

                    log.debug(
                        "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
                        " Removed ambiguity by choosing dimension %s.",
                        fieldName,
                        ", ".join(assignedDimensions),
                        selected,
                    )

                    for candidateDimension in assignedDimensions:
                        if candidateDimension != selected:
                            del guessedAssociation[candidateDimension][fieldName]

            # Update the record look up dict with the new associations
            for dimensionName, values in guessedAssociation.items():
                if values:  # A dict might now be empty
                    log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
                    byRecord[dimensionName].update(values)

        if byRecord:
            # Some record specifiers were found so we need to convert
            # them to the Id form
            for dimensionName, values in byRecord.items():
                if dimensionName in newDataId:
                    log.debug(
                        "DataId specified explicit %s dimension value of %s in addition to"
                        " general record specifiers for it of %s. Ignoring record information.",
                        dimensionName,
                        newDataId[dimensionName],
                        str(values),
                    )
                    # Get the actual record and compare with these values.
                    try:
                        recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
                    except DataIdError:
                        raise ValueError(
                            f"Could not find dimension '{dimensionName}'"
                            f" with dataId {newDataId} as part of comparing with"
                            f" record values {byRecord[dimensionName]}"
                        ) from None
                    if len(recs) == 1:
                        errmsg: list[str] = []
                        for k, v in values.items():
                            if (recval := getattr(recs[0], k)) != v:
                                errmsg.append(f"{k}({recval} != {v})")
                        if errmsg:
                            raise ValueError(
                                f"Dimension {dimensionName} in dataId has explicit value"
                                " inconsistent with records: " + ", ".join(errmsg)
                            )
                    else:
                        # Multiple matches for an explicit dimension
                        # should never happen but let downstream complain.
                        pass
                    continue

                # Build up a WHERE expression
                bind = dict(values.items())
                where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)

                # Hopefully we get a single record that matches
                records = set(
                    self._registry.queryDimensionRecords(
                        dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
                    )
                )

                if len(records) != 1:
                    if len(records) > 1:
                        # visit can have an ambiguous answer without involving
                        # visit_system. The default visit_system is defined
                        # by the instrument.
                        if (
                            dimensionName == "visit"
                            and "visit_system_membership" in self.dimensions
                            and "visit_system" in self.dimensions["instrument"].metadata
                        ):
                            instrument_records = list(
                                self._registry.queryDimensionRecords(
                                    "instrument",
                                    dataId=newDataId,
                                    **kwargs,
                                )
                            )
                            if len(instrument_records) == 1:
                                visit_system = instrument_records[0].visit_system
                                if visit_system is None:
                                    # Set to a value that will never match.
                                    visit_system = -1

                                # Look up each visit in the
                                # visit_system_membership records.
                                for rec in records:
                                    membership = list(
                                        self._registry.queryDimensionRecords(
                                            # Use bind to allow zero results.
                                            # This is a fully-specified query.
                                            "visit_system_membership",
                                            where="instrument = inst AND visit_system = system AND visit = v",
                                            bind=dict(
                                                inst=instrument_records[0].name, system=visit_system, v=rec.id
                                            ),
                                        )
                                    )
                                    if membership:
                                        # This record is the right answer.
                                        records = {rec}
                                        break

                        # The ambiguity may have been resolved so check again.
                        if len(records) > 1:
                            log.debug("Received %d records from constraints of %s", len(records), str(values))
                            for r in records:
                                log.debug("- %s", str(r))
                            raise ValueError(
                                f"DataId specification for dimension {dimensionName} is not"
                                f" uniquely constrained to a single dataset by {values}."
                                f" Got {len(records)} results."
                            )
                    else:
                        raise ValueError(
                            f"DataId specification for dimension {dimensionName} matched no"
                            f" records when constrained by {values}"
                        )

                # Get the primary key from the real dimension object
                dimension = self.dimensions.getStaticDimensions()[dimensionName]
                if not isinstance(dimension, Dimension):
                    raise RuntimeError(
                        f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
                    )
                newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)

        return newDataId, kwargs
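
    # Example (illustrative sketch, not part of the module): the data ID
    # rewriting above lets callers use dimension record values instead of
    # primary keys. The instrument name, day_obs and seq_num values here are
    # hypothetical.
    #
    #     butler.get(
    #         "raw",
    #         instrument="HSC",
    #         detector=50,
    #         dataId={"exposure.day_obs": 20230801, "exposure.seq_num": 42},
    #     )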

    def _findDatasetRef(
        self,
        datasetRefOrType: DatasetRef | DatasetType | str,
        dataId: DataId | None = None,
        *,
        collections: Any = None,
        predict: bool = False,
        run: str | None = None,
        **kwargs: Any,
    ) -> DatasetRef:
        """Shared logic for methods that start with a search for a dataset in
        the registry.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        predict : `bool`, optional
            If `True`, return a newly created `DatasetRef` with a unique
            dataset ID if finding a reference in the `Registry` fails.
            Defaults to `False`.
        run : `str`, optional
            Run collection name to use for creating `DatasetRef` for predicted
            datasets. Only used if ``predict`` is `True`.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset identified by the given arguments.
            This can be the same dataset reference as given if it was
            resolved.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` (and
            ``predict`` is `False`).
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
        if isinstance(datasetRefOrType, DatasetRef):
            if collections is not None:
                warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
            return datasetRefOrType
        timespan: Timespan | None = None

        dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)

        if datasetType.isCalibration():
            # Because this is a calibration dataset, first try to standardize
            # the data ID without restricting the dimensions to those of the
            # dataset type requested, because there may be extra dimensions
            # that provide temporal information for a validity-range lookup.
            dataId = DataCoordinate.standardize(
                dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
            )
            if dataId.graph.temporal:
                dataId = self._registry.expandDataId(dataId)
                timespan = dataId.timespan
        else:
            # Standardize the data ID to just the dimensions of the dataset
            # type instead of letting registry.findDataset do it, so we get the
            # result even if no dataset is found.
            dataId = DataCoordinate.standardize(
                dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs
            )
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self._registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan)
        if ref is None:
            if predict:
                if run is None:
                    run = self.run
                    if run is None:
                        raise TypeError("Cannot predict dataset ID/location with run=None.")
                return DatasetRef(datasetType, dataId, run=run)
            else:
                if collections is None:
                    collections = self._registry.defaults.collections
                raise LookupError(
                    f"Dataset {datasetType.name} with data ID {dataId} "
                    f"could not be found in collections {collections}."
                )
        if datasetType != ref.datasetType:
            # If they differ it is because the user explicitly specified
            # a compatible dataset type to this call rather than using the
            # registry definition. The DatasetRef must therefore be recreated
            # using the user definition such that the expected type is
            # returned.
            ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id)

        return ref

    # TODO: remove on DM-40067.
    @transactional
    @deprecated(
        reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
        " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
        " were relying on the run parameter to determine the run."
        " Will be removed after v26.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
        # Docstring inherited.
        return self.put(obj, ref)

    @transactional
    def put(
        self,
        obj: Any,
        datasetRefOrType: DatasetRef | DatasetType | str,
        /,
        dataId: DataId | None = None,
        *,
        run: str | None = None,
        **kwargs: Any,
    ) -> DatasetRef:
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` is provided, ``dataId`` should be `None`.
            Otherwise the `DatasetType` or name thereof. If a fully resolved
            `DatasetRef` is given the run and ID are used directly.
        dataId : `dict` or `DataCoordinate`
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the second argument.
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``. Not used if a resolved `DatasetRef` is provided.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
            parameters. Not used if a resolved `DatasetRef` is provided.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler is read-only or if no run has been provided.
        """
        if isinstance(datasetRefOrType, DatasetRef):
            # This is a direct put of predefined DatasetRef.
            log.debug("Butler put direct: %s", datasetRefOrType)
            if run is not None:
                warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
            # If registry already has a dataset with the same dataset ID,
            # dataset type and DataId, then _importDatasets will do nothing and
            # just return an original ref. We have to raise in this case, there
            # is a datastore check below for that.
            self._registry._importDatasets([datasetRefOrType], expand=True)
            # Before trying to write to the datastore check that it does not
            # know this dataset. This is prone to races, of course.
            if self._datastore.knows(datasetRefOrType):
                raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
            # Try to write dataset to the datastore, if it fails due to a race
            # with another write, the content of stored data may be
            # unpredictable.
            try:
                self._datastore.put(obj, datasetRefOrType)
            except IntegrityError as e:
                raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
            return datasetRefOrType

        log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
        if not self.isWriteable():
            raise TypeError("Butler is read-only.")
        datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)

        # Handle dimension records in dataId
        dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)

        # Add Registry Dataset entry.
        dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
        (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
        self._datastore.put(obj, ref)

        return ref
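
    # Example (illustrative sketch, not part of the module): storing a new
    # dataset in the default run with the method above. The dataset type
    # name, run and data ID values are hypothetical.
    #
    #     butler = Butler("/path/to/repo", run="u/alice/scratch")
    #     ref = butler.put(my_catalog, "sourceTable", instrument="HSC", visit=1234)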

    # TODO: remove on DM-40067.
    @deprecated(
        reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
        " Please use Butler.get(). Will be removed after v26.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirect(
        self,
        ref: DatasetRef,
        *,
        parameters: dict[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> Any:
        """Retrieve a stored dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        obj : `object`
            The dataset.
        """
        return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)

    # TODO: remove on DM-40067.
    @deprecated(
        reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
        "Please use Butler.getDeferred(). Will be removed after v26.0.",
        version="v26.0",
        category=FutureWarning,
    )
    def getDirectDeferred(
        self,
        ref: DatasetRef,
        *,
        parameters: dict | None = None,
        storageClass: str | StorageClass | None = None,
    ) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        from a resolved `DatasetRef`.

        Parameters
        ----------
        ref : `DatasetRef`
            Resolved reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry`.
        """
        # Check that dataset is known to the datastore.
        if not self._datastore.knows(ref):
            raise LookupError(f"Dataset reference {ref} is not known to datastore.")
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)

    def getDeferred(
        self,
        datasetRefOrType: DatasetRef | DatasetType | str,
        /,
        dataId: DataId | None = None,
        *,
        parameters: dict | None = None,
        collections: Any = None,
        storageClass: str | StorageClass | None = None,
        **kwargs: Any,
    ) -> DeferredDatasetHandle:
        """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
        after an immediate registry lookup.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict` of `Dimension` link name, value pairs that label the
            `DatasetRef` within a Collection. When `None`, a `DatasetRef`
            should be provided as the first argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.
        collections : Any, optional
            Collections to be searched, overriding ``self.collections``.
            Can be any of the types supported by the ``collections`` argument
            to butler construction.
        storageClass : `StorageClass` or `str`, optional
            The storage class to be used to override the Python type
            returned by this method. By default the returned type matches
            the dataset type definition for this dataset. Specifying a
            read `StorageClass` can force a different type to be returned.
            This type must be compatible with the original type.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataId`. See `DataId` parameters.

        Returns
        -------
        obj : `DeferredDatasetHandle`
            A handle which can be used to retrieve a dataset at a later time.

        Raises
        ------
        LookupError
            Raised if no matching dataset exists in the `Registry` or
            datastore.
        ValueError
            Raised if a resolved `DatasetRef` was passed as an input, but it
            differs from the one found in the registry.
        TypeError
            Raised if no collections were provided.
        """
        if isinstance(datasetRefOrType, DatasetRef):
            # Do the quick check first and if that fails, check for artifact
            # existence. This is necessary for datastores that are configured
            # in trust mode where there won't be a record but there will be
            # a file.
            if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
                ref = datasetRefOrType
            else:
                raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
        else:
            ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
        return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
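
    # Example (illustrative sketch, not part of the module): deferring the
    # actual read until the handle is used. Dataset type and data ID values
    # are hypothetical.
    #
    #     handle = butler.getDeferred("calexp", instrument="HSC", visit=1234, detector=50)
    #     exposure = handle.get()  # I/O happens here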

1356 def get( 

1357 self, 

1358 datasetRefOrType: DatasetRef | DatasetType | str, 

1359 /, 

1360 dataId: DataId | None = None, 

1361 *, 

1362 parameters: dict[str, Any] | None = None, 

1363 collections: Any = None, 

1364 storageClass: StorageClass | str | None = None, 

1365 **kwargs: Any, 

1366 ) -> Any: 

1367 """Retrieve a stored dataset. 

1368 

1369 Parameters 

1370 ---------- 

1371 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1372 When `DatasetRef` the `dataId` should be `None`. 

1373 Otherwise the `DatasetType` or name thereof. 

1374 If a resolved `DatasetRef`, the associated dataset 

1375 is returned directly without additional querying. 

1376 dataId : `dict` or `DataCoordinate` 

1377 A `dict` of `Dimension` link name, value pairs that label the 

1378 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1379 should be provided as the first argument. 

1380 parameters : `dict` 

1381 Additional StorageClass-defined options to control reading, 

1382 typically used to efficiently read only a subset of the dataset. 

1383 collections : Any, optional 

1384 Collections to be searched, overriding ``self.collections``. 

1385 Can be any of the types supported by the ``collections`` argument 

1386 to butler construction. 

1387 storageClass : `StorageClass` or `str`, optional 

1388 The storage class to be used to override the Python type 

1389 returned by this method. By default the returned type matches 

1390 the dataset type definition for this dataset. Specifying a 

1391 read `StorageClass` can force a different type to be returned. 

1392 This type must be compatible with the original type. 

1393 **kwargs 

1394 Additional keyword arguments used to augment or construct a 

1395 `DataCoordinate`. See `DataCoordinate.standardize` 

1396 parameters. 

1397 

1398 Returns 

1399 ------- 

1400 obj : `object` 

1401 The dataset. 

1402 

1403 Raises 

1404 ------ 

1405 LookupError 

1406 Raised if no matching dataset exists in the `Registry`. 

1407 TypeError 

1408 Raised if no collections were provided. 

1409 

1410 Notes 

1411 ----- 

1412 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1413 this method requires that the given data ID include temporal dimensions 

1414 beyond the dimensions of the dataset type itself, in order to find the 

1415 dataset with the appropriate validity range. For example, a "bias" 

1416 dataset with native dimensions ``{instrument, detector}`` could be 

1417 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1418 ``exposure`` is a temporal dimension. 

1419 """ 

1420 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1421 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1422 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1423 
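# Illustrative usage sketch for `get`; the repository path, collection,
# dataset type and data ID values are assumed. The optional ``parameters``
# argument can restrict the read to a subset of the dataset, with the valid
# keys defined by the dataset's storage class.
#
#     butler = Butler("/path/to/repo", collections=["HSC/defaults"])
#     calexp = butler.get("calexp", instrument="HSC", visit=903334, detector=42)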

1424 def getURIs( 

1425 self, 

1426 datasetRefOrType: DatasetRef | DatasetType | str, 

1427 /, 

1428 dataId: DataId | None = None, 

1429 *, 

1430 predict: bool = False, 

1431 collections: Any = None, 

1432 run: str | None = None, 

1433 **kwargs: Any, 

1434 ) -> DatasetRefURIs: 

1435 """Return the URIs associated with the dataset. 

1436 

1437 Parameters 

1438 ---------- 

1439 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1440 When `DatasetRef` the `dataId` should be `None`. 

1441 Otherwise the `DatasetType` or name thereof. 

1442 dataId : `dict` or `DataCoordinate` 

1443 A `dict` of `Dimension` link name, value pairs that label the 

1444 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1445 should be provided as the first argument. 

1446 predict : `bool` 

1447 If `True`, allow URIs to be returned of datasets that have not 

1448 been written. 

1449 collections : Any, optional 

1450 Collections to be searched, overriding ``self.collections``. 

1451 Can be any of the types supported by the ``collections`` argument 

1452 to butler construction. 

1453 run : `str`, optional 

1454 Run to use for predictions, overriding ``self.run``. 

1455 **kwargs 

1456 Additional keyword arguments used to augment or construct a 

1457 `DataCoordinate`. See `DataCoordinate.standardize` 

1458 parameters. 

1459 

1460 Returns 

1461 ------- 

1462 uris : `DatasetRefURIs` 

1463 The URI to the primary artifact associated with this dataset (if 

1464 the dataset was disassembled within the datastore this may be 

1465 `None`), and the URIs to any components associated with the dataset 

1466 artifact (this can be empty if there are no components). 

1467 """ 

1468 ref = self._findDatasetRef( 

1469 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1470 ) 

1471 return self._datastore.getURIs(ref, predict) 

1472 

1473 def getURI( 

1474 self, 

1475 datasetRefOrType: DatasetRef | DatasetType | str, 

1476 /, 

1477 dataId: DataId | None = None, 

1478 *, 

1479 predict: bool = False, 

1480 collections: Any = None, 

1481 run: str | None = None, 

1482 **kwargs: Any, 

1483 ) -> ResourcePath: 

1484 """Return the URI to the Dataset. 

1485 

1486 Parameters 

1487 ---------- 

1488 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1489 When `DatasetRef` the `dataId` should be `None`. 

1490 Otherwise the `DatasetType` or name thereof. 

1491 dataId : `dict` or `DataCoordinate` 

1492 A `dict` of `Dimension` link name, value pairs that label the 

1493 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1494 should be provided as the first argument. 

1495 predict : `bool` 

1496 If `True`, allow URIs to be returned of datasets that have not 

1497 been written. 

1498 collections : Any, optional 

1499 Collections to be searched, overriding ``self.collections``. 

1500 Can be any of the types supported by the ``collections`` argument 

1501 to butler construction. 

1502 run : `str`, optional 

1503 Run to use for predictions, overriding ``self.run``. 

1504 **kwargs 

1505 Additional keyword arguments used to augment or construct a 

1506 `DataCoordinate`. See `DataCoordinate.standardize` 

1507 parameters. 

1508 

1509 Returns 

1510 ------- 

1511 uri : `lsst.resources.ResourcePath` 

1512 URI pointing to the Dataset within the datastore. If the 

1513 Dataset does not exist in the datastore, and if ``predict`` is 

1514 `True`, the URI will be a prediction and will include a URI 

1515 fragment "#predicted". 

1516 If the datastore does not have entities that relate well 

1517 to the concept of a URI, the returned URI string will be 

1518 descriptive. The returned URI is not guaranteed to be obtainable. 

1519 

1520 Raises 

1521 ------ 

1522 LookupError 

1523 Raised if a URI has been requested for a dataset that does not 

1524 exist and guessing is not allowed. 

1525 ValueError 

1526 Raised if a resolved `DatasetRef` was passed as an input, but it 

1527 differs from the one found in the registry. 

1528 TypeError 

1529 Raised if no collections were provided. 

1530 RuntimeError 

1531 Raised if a URI is requested for a dataset that consists of 

1532 multiple artifacts. 

1533 """ 

1534 primary, components = self.getURIs( 

1535 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1536 ) 

1537 

1538 if primary is None or components: 

1539 raise RuntimeError( 

1540 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1541 "Use Butler.getURIs() instead." 

1542 ) 

1543 return primary 

1544 
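# Illustrative sketch for URI retrieval; the dataset type and data ID values
# are assumed. `getURI` is for single-artifact datasets; `getURIs` also covers
# disassembled datasets with per-component artifacts.
#
#     uri = butler.getURI("calexp", instrument="HSC", visit=903334, detector=42)
#     primary, components = butler.getURIs(
#         "calexp", instrument="HSC", visit=903334, detector=42
#     )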

1545 def retrieveArtifacts( 

1546 self, 

1547 refs: Iterable[DatasetRef], 

1548 destination: ResourcePathExpression, 

1549 transfer: str = "auto", 

1550 preserve_path: bool = True, 

1551 overwrite: bool = False, 

1552 ) -> list[ResourcePath]: 

1553 """Retrieve the artifacts associated with the supplied refs. 

1554 

1555 Parameters 

1556 ---------- 

1557 refs : iterable of `DatasetRef` 

1558 The datasets for which artifacts are to be retrieved. 

1559 A single ref can result in multiple artifacts. The refs must 

1560 be resolved. 

1561 destination : `lsst.resources.ResourcePath` or `str` 

1562 Location to write the artifacts. 

1563 transfer : `str`, optional 

1564 Method to use to transfer the artifacts. Must be one of the options 

1565 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1566 "move" is not allowed. 

1567 preserve_path : `bool`, optional 

1568 If `True` the full path of the artifact within the datastore 

1569 is preserved. If `False` the final file component of the path 

1570 is used. 

1571 overwrite : `bool`, optional 

1572 If `True` allow transfers to overwrite existing files at the 

1573 destination. 

1574 

1575 Returns 

1576 ------- 

1577 targets : `list` of `lsst.resources.ResourcePath` 

1578 URIs of file artifacts in destination location. Order is not 

1579 preserved. 

1580 

1581 Notes 

1582 ----- 

1583 For non-file datastores the artifacts written to the destination 

1584 may not match the representation inside the datastore. For example 

1585 a hierarchical data structure in a NoSQL database may well be stored 

1586 as a JSON file. 

1587 """ 

1588 return self._datastore.retrieveArtifacts( 

1589 refs, 

1590 ResourcePath(destination), 

1591 transfer=transfer, 

1592 preserve_path=preserve_path, 

1593 overwrite=overwrite, 

1594 ) 

1595 
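# Illustrative sketch for `retrieveArtifacts`; the query, collection and
# destination directory are assumed.
#
#     refs = butler.registry.queryDatasets("calexp", collections=["HSC/defaults"])
#     paths = butler.retrieveArtifacts(
#         refs, destination="/tmp/calexp_files", transfer="copy", preserve_path=False
#     )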

1596 def exists( 

1597 self, 

1598 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1599 /, 

1600 data_id: DataId | None = None, 

1601 *, 

1602 full_check: bool = True, 

1603 collections: Any = None, 

1604 **kwargs: Any, 

1605 ) -> DatasetExistence: 

1606 """Indicate whether a dataset is known to Butler registry and 

1607 datastore. 

1608 

1609 Parameters 

1610 ---------- 

1611 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1612 When `DatasetRef` the `dataId` should be `None`. 

1613 Otherwise the `DatasetType` or name thereof. 

1614 data_id : `dict` or `DataCoordinate` 

1615 A `dict` of `Dimension` link name, value pairs that label the 

1616 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1617 should be provided as the first argument. 

1618 full_check : `bool`, optional 

1619 If `True`, an additional check will be made for dataset artifact 

1620 existence. This will involve additional overhead due to the need 

1621 to query an external system. If `False`, the registry and datastore 

1622 will only be asked whether they know about the dataset; no 

1623 check for the artifact will be performed. 

1624 collections : Any, optional 

1625 Collections to be searched, overriding ``self.collections``. 

1626 Can be any of the types supported by the ``collections`` argument 

1627 to butler construction. 

1628 **kwargs 

1629 Additional keyword arguments used to augment or construct a 

1630 `DataCoordinate`. See `DataCoordinate.standardize` 

1631 parameters. 

1632 

1633 Returns 

1634 ------- 

1635 existence : `DatasetExistence` 

1636 Object indicating whether the dataset is known to registry and 

1637 datastore. Evaluates to `True` if the dataset is present and known 

1638 to both. 

1639 """ 

1640 existence = DatasetExistence.UNRECOGNIZED 

1641 

1642 if isinstance(dataset_ref_or_type, DatasetRef): 

1643 if collections is not None: 

1644 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1645 if data_id is not None: 

1646 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1647 ref = dataset_ref_or_type 

1648 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1649 if registry_ref is not None: 

1650 existence |= DatasetExistence.RECORDED 

1651 

1652 if dataset_ref_or_type != registry_ref: 

1653 # This could mean that storage classes differ, so we should 

1654 # check for that but use the registry ref for the rest of 

1655 # the method. 

1656 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1657 # Use the registry version from now on. 

1658 ref = registry_ref 

1659 else: 

1660 raise ValueError( 

1661 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1662 f"in registry but has different incompatible values ({registry_ref})." 

1663 ) 

1664 else: 

1665 try: 

1666 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1667 except (LookupError, TypeError, NoDefaultCollectionError): 

1668 return existence 

1669 existence |= DatasetExistence.RECORDED 

1670 

1671 if self._datastore.knows(ref): 

1672 existence |= DatasetExistence.DATASTORE 

1673 

1674 if full_check: 

1675 if self._datastore.exists(ref): 

1676 existence |= DatasetExistence._ARTIFACT 

1677 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1678 # Do not add this flag if we have no other idea about a dataset. 

1679 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1680 

1681 return existence 

1682 
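# Illustrative sketch for `exists`; the dataset type and data ID values are
# assumed, and ``some_ref`` stands for an already-resolved `DatasetRef`.
#
#     existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=42)
#     if existence:
#         ...  # known to registry and datastore, and the artifact was found
#     existence = butler.exists(some_ref, full_check=False)  # skip the artifact check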

1683 def _exists_many( 

1684 self, 

1685 refs: Iterable[DatasetRef], 

1686 /, 

1687 *, 

1688 full_check: bool = True, 

1689 ) -> dict[DatasetRef, DatasetExistence]: 

1690 """Indicate whether multiple datasets are known to Butler registry and 

1691 datastore. 

1692 

1693 This is an experimental API that may change at any moment. 

1694 

1695 Parameters 

1696 ---------- 

1697 refs : iterable of `DatasetRef` 

1698 The datasets to be checked. 

1699 full_check : `bool`, optional 

1700 If `True`, an additional check will be made for dataset artifact 

1701 existence. This will involve additional overhead due to the need 

1702 to query an external system. If `False`, the registry and datastore 

1703 will only be asked whether they know about the dataset; no 

1704 check for the artifact will be performed. 

1705 

1706 Returns 

1707 ------- 

1708 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1709 Mapping from the given dataset refs to an enum indicating the 

1710 status of the dataset in registry and datastore. 

1711 Each value evaluates to `True` if the dataset is present and known 

1712 to both. 

1713 """ 

1714 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1715 

1716 # Registry does not have a bulk API to check for a ref. 

1717 for ref in refs: 

1718 registry_ref = self._registry.getDataset(ref.id) 

1719 if registry_ref is not None: 

1720 # It is possible, albeit unlikely, that the given ref does 

1721 # not match the one in registry even though the UUID matches. 

1722 # When checking a single ref we raise, but it's impolite to 

1723 # do that when potentially hundreds of refs are being checked. 

1724 # We could change the API to only accept UUIDs and that would 

1725 # remove the ability to even check and remove the worry 

1726 # about differing storage classes. Given the ongoing discussion 

1727 # on refs vs UUIDs and whether to raise or have a new 

1728 # private flag, treat this as a private API for now. 

1729 existence[ref] |= DatasetExistence.RECORDED 

1730 

1731 # Ask datastore if it knows about these refs. 

1732 knows = self._datastore.knows_these(refs) 

1733 for ref, known in knows.items(): 

1734 if known: 

1735 existence[ref] |= DatasetExistence.DATASTORE 

1736 

1737 if full_check: 

1738 mexists = self._datastore.mexists(refs) 

1739 for ref, exists in mexists.items(): 

1740 if exists: 

1741 existence[ref] |= DatasetExistence._ARTIFACT 

1742 else: 

1743 # Do not set this flag if nothing is known about the dataset. 

1744 for ref in existence: 

1745 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1746 existence[ref] |= DatasetExistence._ASSUMED 

1747 

1748 return existence 

1749 
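# Illustrative sketch for the experimental `_exists_many`; ``refs`` stands for
# an iterable of resolved `DatasetRef` objects.
#
#     existence_map = butler._exists_many(refs, full_check=False)
#     missing = [ref for ref, state in existence_map.items() if not state]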

1750 # TODO: remove on DM-40079. 

1751 @deprecated( 

1752 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", 

1753 version="v26.0", 

1754 category=FutureWarning, 

1755 ) 

1756 def datasetExists( 

1757 self, 

1758 datasetRefOrType: DatasetRef | DatasetType | str, 

1759 dataId: DataId | None = None, 

1760 *, 

1761 collections: Any = None, 

1762 **kwargs: Any, 

1763 ) -> bool: 

1764 """Return True if the Dataset is actually present in the Datastore. 

1765 

1766 Parameters 

1767 ---------- 

1768 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1769 When `DatasetRef` the `dataId` should be `None`. 

1770 Otherwise the `DatasetType` or name thereof. 

1771 dataId : `dict` or `DataCoordinate` 

1772 A `dict` of `Dimension` link name, value pairs that label the 

1773 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1774 should be provided as the first argument. 

1775 collections : Any, optional 

1776 Collections to be searched, overriding ``self.collections``. 

1777 Can be any of the types supported by the ``collections`` argument 

1778 to butler construction. 

1779 **kwargs 

1780 Additional keyword arguments used to augment or construct a 

1781 `DataCoordinate`. See `DataCoordinate.standardize` 

1782 parameters. 

1783 

1784 Raises 

1785 ------ 

1786 LookupError 

1787 Raised if the dataset is not even present in the Registry. 

1788 ValueError 

1789 Raised if a resolved `DatasetRef` was passed as an input, but it 

1790 differs from the one found in the registry. 

1791 NoDefaultCollectionError 

1792 Raised if no collections were provided. 

1793 """ 

1794 # A resolved ref may be given that is not known to this butler. 

1795 if isinstance(datasetRefOrType, DatasetRef): 

1796 ref = self._registry.getDataset(datasetRefOrType.id) 

1797 if ref is None: 

1798 raise LookupError( 

1799 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1800 ) 

1801 else: 

1802 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1803 return self._datastore.exists(ref) 

1804 

1805 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1806 """Remove one or more `~CollectionType.RUN` collections and the 

1807 datasets within them. 

1808 

1809 Parameters 

1810 ---------- 

1811 names : `~collections.abc.Iterable` [ `str` ] 

1812 The names of the collections to remove. 

1813 unstore : `bool`, optional 

1814 If `True` (default), delete datasets from all datastores in which 

1815 they are present, and attempt to roll back the registry deletions if 

1816 datastore deletions fail (which may not always be possible). If 

1817 `False`, datastore records for these datasets are still removed, 

1818 but any artifacts (e.g. files) will not be. 

1819 

1820 Raises 

1821 ------ 

1822 TypeError 

1823 Raised if one or more collections are not of type 

1824 `~CollectionType.RUN`. 

1825 """ 

1826 if not self.isWriteable(): 

1827 raise TypeError("Butler is read-only.") 

1828 names = list(names) 

1829 refs: list[DatasetRef] = [] 

1830 for name in names: 

1831 collectionType = self._registry.getCollectionType(name) 

1832 if collectionType is not CollectionType.RUN: 

1833 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1834 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1835 with self._datastore.transaction(), self._registry.transaction(): 

1836 if unstore: 

1837 self._datastore.trash(refs) 

1838 else: 

1839 self._datastore.forget(refs) 

1840 for name in names: 

1841 self._registry.removeCollection(name) 

1842 if unstore: 

1843 # Point of no return for removing artifacts 

1844 self._datastore.emptyTrash() 

1845 
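# Illustrative sketch for `removeRuns`; the run names are assumed and
# ``butler`` must be writeable.
#
#     butler.removeRuns(["u/someone/scratch-1", "u/someone/scratch-2"], unstore=True)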

1846 def pruneDatasets( 

1847 self, 

1848 refs: Iterable[DatasetRef], 

1849 *, 

1850 disassociate: bool = True, 

1851 unstore: bool = False, 

1852 tags: Iterable[str] = (), 

1853 purge: bool = False, 

1854 ) -> None: 

1855 # docstring inherited from LimitedButler 

1856 

1857 if not self.isWriteable(): 

1858 raise TypeError("Butler is read-only.") 

1859 if purge: 

1860 if not disassociate: 

1861 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1862 if not unstore: 

1863 raise TypeError("Cannot pass purge=True without unstore=True.") 

1864 elif disassociate: 

1865 tags = tuple(tags) 

1866 if not tags: 

1867 raise TypeError("No tags provided but disassociate=True.") 

1868 for tag in tags: 

1869 collectionType = self._registry.getCollectionType(tag) 

1870 if collectionType is not CollectionType.TAGGED: 

1871 raise TypeError( 

1872 f"Cannot disassociate from collection '{tag}' " 

1873 f"of non-TAGGED type {collectionType.name}." 

1874 ) 

1875 # Transform possibly-single-pass iterable into something we can iterate 

1876 # over multiple times. 

1877 refs = list(refs) 

1878 # Pruning a component of a DatasetRef makes no sense since registry 

1879 # doesn't know about components and datastore might not store 

1880 # components in a separate file 

1881 for ref in refs: 

1882 if ref.datasetType.component(): 

1883 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1884 # We don't need an unreliable Datastore transaction for this, because 

1885 # we've been extra careful to ensure that Datastore.trash only involves 

1886 # mutating the Registry (it can _look_ at Datastore-specific things, 

1887 # but shouldn't change them), and hence all operations here are 

1888 # Registry operations. 

1889 with self._datastore.transaction(), self._registry.transaction(): 

1890 if unstore: 

1891 self._datastore.trash(refs) 

1892 if purge: 

1893 self._registry.removeDatasets(refs) 

1894 elif disassociate: 

1895 assert tags, "Guaranteed by earlier logic in this function." 

1896 for tag in tags: 

1897 self._registry.disassociate(tag, refs) 

1898 # We've exited the Registry transaction, and apparently committed. 

1899 # (if there was an exception, everything rolled back, and it's as if 

1900 # nothing happened - and we never get here). 

1901 # Datastore artifacts are not yet gone, but they're clearly marked 

1902 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1903 # problems we can try again later, and if manual administrative 

1904 # intervention is required, it's pretty clear what that should entail: 

1905 # deleting everything on disk and in private Datastore tables that is 

1906 # in the dataset_location_trash table. 

1907 if unstore: 

1908 # Point of no return for removing artifacts 

1909 self._datastore.emptyTrash() 

1910 
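# Illustrative sketch for `pruneDatasets`; the dataset type and collection
# names are assumed and ``butler`` must be writeable.
#
#     refs = list(butler.registry.queryDatasets("calexp", collections=["u/someone/tagged"]))
#     # Remove the datasets entirely; purge requires disassociate=True and unstore=True.
#     butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)
#     # Or only remove them from a TAGGED collection, leaving storage untouched.
#     butler.pruneDatasets(refs, disassociate=True, unstore=False, tags=["u/someone/tagged"])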

1911 @transactional 

1912 def ingest( 

1913 self, 

1914 *datasets: FileDataset, 

1915 transfer: str | None = "auto", 

1916 run: str | None = None, 

1917 idGenerationMode: DatasetIdGenEnum | None = None, 

1918 record_validation_info: bool = True, 

1919 ) -> None: 

1920 """Store and register one or more datasets that already exist on disk. 

1921 

1922 Parameters 

1923 ---------- 

1924 datasets : `FileDataset` 

1925 Each positional argument is a struct containing information about 

1926 a file to be ingested, including its URI (either absolute or 

1927 relative to the datastore root, if applicable), a resolved 

1928 `DatasetRef`, and optionally a formatter class or its 

1929 fully-qualified string name. If a formatter is not provided, the 

1930 formatter that would be used for `put` is assumed. On successful 

1931 ingest all `FileDataset.formatter` attributes will be set to the 

1932 formatter class used. `FileDataset.path` attributes may be modified 

1933 to put paths in whatever the datastore considers a standardized 

1934 form. 

1935 transfer : `str`, optional 

1936 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1937 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1938 transfer the file. 

1939 run : `str`, optional 

1940 The name of the run ingested datasets should be added to, 

1941 overriding ``self.run``. This parameter is now deprecated since 

1942 the run is encoded in the ``FileDataset``. 

1943 idGenerationMode : `DatasetIdGenEnum`, optional 

1944 Specifies option for generating dataset IDs. Parameter is 

1945 deprecated. 

1946 record_validation_info : `bool`, optional 

1947 If `True`, the default, the datastore can record validation 

1948 information associated with the file. If `False` the datastore 

1949 will not attempt to track any information such as checksums 

1950 or file sizes. This can be useful if such information is tracked 

1951 in an external system or if the file is to be compressed in place. 

1952 It is up to the datastore whether this parameter is relevant. 

1953 

1954 Raises 

1955 ------ 

1956 TypeError 

1957 Raised if the butler is read-only or if no run was provided. 

1958 NotImplementedError 

1959 Raised if the `Datastore` does not support the given transfer mode. 

1960 DatasetTypeNotSupportedError 

1961 Raised if one or more files to be ingested have a dataset type that 

1962 is not supported by the `Datastore`. 

1963 FileNotFoundError 

1964 Raised if one of the given files does not exist. 

1965 FileExistsError 

1966 Raised if transfer is not `None` but the (internal) location the 

1967 file would be moved to is already occupied. 

1968 

1969 Notes 

1970 ----- 

1971 This operation is not fully exception safe: if a database operation 

1972 fails, the given `FileDataset` instances may be only partially updated. 

1973 

1974 It is atomic in terms of database operations (they will either all 

1975 succeed or all fail) providing the database engine implements 

1976 transactions correctly. It will attempt to be atomic in terms of 

1977 filesystem operations as well, but this cannot be implemented 

1978 rigorously for most datastores. 

1979 """ 

1980 if not self.isWriteable(): 

1981 raise TypeError("Butler is read-only.") 

1982 

1983 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1984 if not datasets: 

1985 return 

1986 

1987 if idGenerationMode is not None: 

1988 warnings.warn( 

1989 "The idGenerationMode parameter is no longer used and is ignored. " 

1990 " Will be removed after v26.0", 

1991 FutureWarning, 

1992 stacklevel=2, 

1993 ) 

1994 

1995 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1996 

1997 # We need to reorganize all the inputs so that they are grouped 

1998 # by dataset type and run. Multiple refs in a single FileDataset 

1999 # are required to share the run and dataset type. 

2000 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

2001 groupedData: GroupedData = defaultdict(list) 

2002 

2003 # Track DataIDs that are being ingested so we can spot issues early 

2004 # with duplication. Retain previous FileDataset so we can report it. 

2005 groupedDataIds: MutableMapping[ 

2006 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

2007 ] = defaultdict(dict) 

2008 

2009 used_run = False 

2010 

2011 # And the nested loop that populates it: 

2012 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

2013 # Somewhere to store pre-existing refs if we have an 

2014 # execution butler. 

2015 existingRefs: list[DatasetRef] = [] 

2016 

2017 for ref in dataset.refs: 

2018 assert ref.run is not None # For mypy 

2019 group_key = (ref.datasetType, ref.run) 

2020 

2021 if ref.dataId in groupedDataIds[group_key]: 

2022 raise ConflictingDefinitionError( 

2023 f"Ingest conflict. Dataset {dataset.path} has same" 

2024 " DataId as other ingest dataset" 

2025 f" {groupedDataIds[group_key][ref.dataId].path} " 

2026 f" ({ref.dataId})" 

2027 ) 

2028 

2029 groupedDataIds[group_key][ref.dataId] = dataset 

2030 

2031 if existingRefs: 

2032 if len(dataset.refs) != len(existingRefs): 

2033 # Keeping track of partially pre-existing datasets is hard 

2034 # and should generally never happen. For now don't allow 

2035 # it. 

2036 raise ConflictingDefinitionError( 

2037 f"For dataset {dataset.path} some dataIds already exist" 

2038 " in registry but others do not. This is not supported." 

2039 ) 

2040 

2041 # Store expanded form in the original FileDataset. 

2042 dataset.refs = existingRefs 

2043 else: 

2044 groupedData[group_key].append(dataset) 

2045 

2046 if not used_run and run is not None: 

2047 warnings.warn( 

2048 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2049 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2050 category=FutureWarning, 

2051 stacklevel=3, # Take into account the @transactional decorator. 

2052 ) 

2053 

2054 # Now we can bulk-insert into Registry for each DatasetType. 

2055 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2056 groupedData.items(), desc="Bulk-inserting datasets by type" 

2057 ): 

2058 refs_to_import = [] 

2059 for dataset in grouped_datasets: 

2060 refs_to_import.extend(dataset.refs) 

2061 

2062 n_refs = len(refs_to_import) 

2063 log.verbose( 

2064 "Importing %d ref%s of dataset type %r into run %r", 

2065 n_refs, 

2066 "" if n_refs == 1 else "s", 

2067 datasetType.name, 

2068 this_run, 

2069 ) 

2070 

2071 # Import the refs and expand the DataCoordinates since we can't 

2072 # guarantee that they are expanded and Datastore will need 

2073 # the records. 

2074 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

2075 assert set(imported_refs) == set(refs_to_import) 

2076 

2077 # Replace all the refs in the FileDataset with expanded versions. 

2078 # Pull them off in the order we put them on the list. 

2079 for dataset in grouped_datasets: 

2080 n_dataset_refs = len(dataset.refs) 

2081 dataset.refs = imported_refs[:n_dataset_refs] 

2082 del imported_refs[:n_dataset_refs] 

2083 

2084 # Bulk-insert everything into Datastore. 

2085 # We do not know if any of the registry entries already existed 

2086 # (_importDatasets only complains if they exist but differ) so 

2087 # we have to catch IntegrityError explicitly. 

2088 try: 

2089 self._datastore.ingest( 

2090 *datasets, transfer=transfer, record_validation_info=record_validation_info 

2091 ) 

2092 except IntegrityError as e: 

2093 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

2094 
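# Illustrative sketch for `ingest`; the file path, dataset type, run and data
# ID values are assumed, and the refs must be resolved (carry a run) before
# ingest.
#
#     from lsst.daf.butler import DatasetRef, FileDataset
#     dataset_type = butler.registry.getDatasetType("raw")
#     data_id = butler.registry.expandDataId(instrument="HSC", exposure=903334, detector=42)
#     ref = DatasetRef(dataset_type, data_id, run="HSC/raw/all")
#     butler.ingest(FileDataset(path="/data/raw/somefile.fits", refs=[ref]), transfer="direct")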

2095 @contextlib.contextmanager 

2096 def export( 

2097 self, 

2098 *, 

2099 directory: str | None = None, 

2100 filename: str | None = None, 

2101 format: str | None = None, 

2102 transfer: str | None = None, 

2103 ) -> Iterator[RepoExportContext]: 

2104 """Export datasets from the repository represented by this `Butler`. 

2105 

2106 This method is a context manager that returns a helper object 

2107 (`RepoExportContext`) that is used to indicate what information from 

2108 the repository should be exported. 

2109 

2110 Parameters 

2111 ---------- 

2112 directory : `str`, optional 

2113 Directory dataset files should be written to if ``transfer`` is not 

2114 `None`. 

2115 filename : `str`, optional 

2116 Name for the file that will include database information associated 

2117 with the exported datasets. If this is not an absolute path and 

2118 ``directory`` is not `None`, it will be written to ``directory`` 

2119 instead of the current working directory. Defaults to 

2120 "export.{format}". 

2121 format : `str`, optional 

2122 File format for the database information file. If `None`, the 

2123 extension of ``filename`` will be used. 

2124 transfer : `str`, optional 

2125 Transfer mode passed to `Datastore.export`. 

2126 

2127 Raises 

2128 ------ 

2129 TypeError 

2130 Raised if the set of arguments passed is inconsistent. 

2131 

2132 Examples 

2133 -------- 

2134 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2135 methods are used to provide the iterables over data IDs and/or datasets 

2136 to be exported:: 

2137 

2138 with butler.export(filename="exports.yaml") as export: 

2139 # Export all flats, but none of the dimension element rows 

2140 # (i.e. data ID information) associated with them. 

2141 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2142 elements=()) 

2143 # Export all datasets that start with "deepCoadd_" and all of 

2144 # their associated data ID information. 

2145 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2146 """ 

2147 if directory is None and transfer is not None: 

2148 raise TypeError("Cannot transfer without providing a directory.") 

2149 if transfer == "move": 

2150 raise TypeError("Transfer may not be 'move': export is read-only") 

2151 if format is None: 

2152 if filename is None: 

2153 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2154 else: 

2155 _, format = os.path.splitext(filename) 

2156 if not format: 

2157 raise ValueError("Please specify a file extension to determine export format.") 

2158 format = format[1:] # Strip leading "." 

2159 elif filename is None: 

2160 filename = f"export.{format}" 

2161 if directory is not None: 

2162 filename = os.path.join(directory, filename) 

2163 formats = self._config["repo_transfer_formats"] 

2164 if format not in formats: 

2165 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2166 BackendClass = get_class_of(formats[format, "export"]) 

2167 with open(filename, "w") as stream: 

2168 backend = BackendClass(stream, universe=self.dimensions) 

2169 try: 

2170 helper = RepoExportContext( 

2171 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

2172 ) 

2173 yield helper 

2174 except BaseException: 

2175 raise 

2176 else: 

2177 helper._finish() 

2178 
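# Illustrative sketch of an export that also copies file artifacts; the
# directory, file name and dataset type are assumed.
#
#     with butler.export(directory="/tmp/export", filename="export.yaml",
#                        transfer="copy") as export:
#         export.saveDatasets(butler.registry.queryDatasets("flat", collections="HSC/calib"))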

2179 def import_( 

2180 self, 

2181 *, 

2182 directory: ResourcePathExpression | None = None, 

2183 filename: ResourcePathExpression | TextIO | None = None, 

2184 format: str | None = None, 

2185 transfer: str | None = None, 

2186 skip_dimensions: set | None = None, 

2187 ) -> None: 

2188 """Import datasets into this repository that were exported from a 

2189 different butler repository via `~lsst.daf.butler.Butler.export`. 

2190 

2191 Parameters 

2192 ---------- 

2193 directory : `~lsst.resources.ResourcePathExpression`, optional 

2194 Directory containing dataset files to import from. If `None`, 

2195 ``filename`` and all dataset file paths specified therein must 

2196 be absolute. 

2197 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2198 A stream or name of file that contains database information 

2199 associated with the exported datasets, typically generated by 

2200 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2201 `~lsst.resources.ResourcePath` and is not an absolute path, 

2202 it will first be looked for relative to ``directory`` and if not 

2203 found there it will be looked for in the current working 

2204 directory. Defaults to "export.{format}". 

2205 format : `str`, optional 

2206 File format for ``filename``. If `None`, the extension of 

2207 ``filename`` will be used. 

2208 transfer : `str`, optional 

2209 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2210 skip_dimensions : `set`, optional 

2211 Names of dimensions that should be skipped and not imported. 

2212 

2213 Raises 

2214 ------ 

2215 TypeError 

2216 Raised if the set of arguments passed is inconsistent, or if the 

2217 butler is read-only. 

2218 """ 

2219 if not self.isWriteable(): 

2220 raise TypeError("Butler is read-only.") 

2221 if format is None: 

2222 if filename is None: 

2223 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2224 else: 

2225 _, format = os.path.splitext(filename) # type: ignore 

2226 elif filename is None: 

2227 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2228 if directory is not None: 

2229 directory = ResourcePath(directory, forceDirectory=True) 

2230 # mypy doesn't think this will work but it does in python >= 3.10. 

2231 if isinstance(filename, ResourcePathExpression): # type: ignore 

2232 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2233 if not filename.isabs() and directory is not None: 

2234 potential = directory.join(filename) 

2235 exists_in_cwd = filename.exists() 

2236 exists_in_dir = potential.exists() 

2237 if exists_in_cwd and exists_in_dir: 

2238 log.warning( 

2239 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2240 "Additionally, the file exists relative to the given search directory (%s). " 

2241 "Using the export file in the given directory.", 

2242 filename, 

2243 potential, 

2244 ) 

2245 # Given they specified an explicit directory and that 

2246 # directory has the export file in it, assume that that 

2247 # is what was meant despite the file in cwd. 

2248 filename = potential 

2249 elif exists_in_dir: 

2250 filename = potential 

2251 elif not exists_in_cwd and not exists_in_dir: 

2252 # Raise early. 

2253 raise FileNotFoundError( 

2254 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2255 ) 

2256 BackendClass: type[RepoImportBackend] = get_class_of( 

2257 self._config["repo_transfer_formats"][format]["import"] 

2258 ) 

2259 

2260 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2261 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

2262 backend.register() 

2263 with self.transaction(): 

2264 backend.load( 

2265 self._datastore, 

2266 directory=directory, 

2267 transfer=transfer, 

2268 skip_dimensions=skip_dimensions, 

2269 ) 

2270 

2271 if isinstance(filename, ResourcePath): 

2272 # We cannot use open() here at the moment because of 

2273 # DM-38589 since yaml does stream.read(8192) in a loop. 

2274 stream = io.StringIO(filename.read().decode()) 

2275 doImport(stream) 

2276 else: 

2277 doImport(filename) # type: ignore 

2278 
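# Illustrative sketch for `import_`; the paths are assumed and the file is one
# previously written by `Butler.export`.
#
#     butler.import_(directory="/tmp/export", filename="export.yaml", transfer="symlink")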

2279 def transfer_from( 

2280 self, 

2281 source_butler: LimitedButler, 

2282 source_refs: Iterable[DatasetRef], 

2283 transfer: str = "auto", 

2284 skip_missing: bool = True, 

2285 register_dataset_types: bool = False, 

2286 transfer_dimensions: bool = False, 

2287 ) -> collections.abc.Collection[DatasetRef]: 

2288 """Transfer datasets to this Butler from a run in another Butler. 

2289 

2290 Parameters 

2291 ---------- 

2292 source_butler : `LimitedButler` 

2293 Butler from which the datasets are to be transferred. If data IDs 

2294 in ``source_refs`` are not expanded then this has to be a full 

2295 `Butler` whose registry will be used to expand data IDs. 

2296 source_refs : iterable of `DatasetRef` 

2297 Datasets defined in the source butler that should be transferred to 

2298 this butler. 

2299 transfer : `str`, optional 

2300 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2301 skip_missing : `bool` 

2302 If `True`, datasets with no datastore artifact associated with 

2303 them are not transferred. If `False` a registry entry will be 

2304 created even if no datastore record is created (and so will 

2305 look equivalent to the dataset being unstored). 

2306 register_dataset_types : `bool` 

2307 If `True` any missing dataset types are registered. Otherwise 

2308 an exception is raised. 

2309 transfer_dimensions : `bool`, optional 

2310 If `True`, dimension record data associated with the new datasets 

2311 will be transferred. 

2312 

2313 Returns 

2314 ------- 

2315 refs : `list` of `DatasetRef` 

2316 The refs added to this Butler. 

2317 

2318 Notes 

2319 ----- 

2320 The datastore artifact has to exist for a transfer 

2321 to be made but non-existence is not an error. 

2322 

2323 Datasets that already exist in this run will be skipped. 

2324 

2325 The datasets are imported as part of a transaction, although 

2326 dataset types are registered before the transaction is started. 

2327 This means that it is possible for a dataset type to be registered 

2328 even though transfer has failed. 

2329 """ 

2330 if not self.isWriteable(): 

2331 raise TypeError("Butler is read-only.") 

2332 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2333 

2334 # Will iterate through the refs multiple times so need to convert 

2335 # to a list if this isn't a collection. 

2336 if not isinstance(source_refs, collections.abc.Collection): 

2337 source_refs = list(source_refs) 

2338 

2339 original_count = len(source_refs) 

2340 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2341 

2342 # In some situations the datastore artifact may be missing 

2343 # and we do not want that registry entry to be imported. 

2344 # Asking datastore is not sufficient, the records may have been 

2345 # purged, we have to ask for the (predicted) URI and check 

2346 # existence explicitly. Execution butler is set up exactly like 

2347 # this with no datastore records. 

2348 artifact_existence: dict[ResourcePath, bool] = {} 

2349 if skip_missing: 

2350 dataset_existence = source_butler._datastore.mexists( 

2351 source_refs, artifact_existence=artifact_existence 

2352 ) 

2353 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2354 filtered_count = len(source_refs) 

2355 n_missing = original_count - filtered_count 

2356 log.verbose( 

2357 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2358 n_missing, 

2359 "" if n_missing == 1 else "s", 

2360 filtered_count, 

2361 ) 

2362 

2363 # Importing requires that we group the refs by dataset type and run 

2364 # before doing the import. 

2365 source_dataset_types = set() 

2366 grouped_refs = defaultdict(list) 

2367 for ref in source_refs: 

2368 grouped_refs[ref.datasetType, ref.run].append(ref) 

2369 source_dataset_types.add(ref.datasetType) 

2370 

2371 # Check to see if the dataset type in the source butler has 

2372 # the same definition in the target butler and register missing 

2373 # ones if requested. Registration must happen outside a transaction. 

2374 newly_registered_dataset_types = set() 

2375 for datasetType in source_dataset_types: 

2376 if register_dataset_types: 

2377 # Let this raise immediately if inconsistent. Continuing 

2378 # on to find additional inconsistent dataset types 

2379 # might result in additional unwanted dataset types being 

2380 # registered. 

2381 if self._registry.registerDatasetType(datasetType): 

2382 newly_registered_dataset_types.add(datasetType) 

2383 else: 

2384 # If the dataset type is missing, let it fail immediately. 

2385 target_dataset_type = self._registry.getDatasetType(datasetType.name) 

2386 if target_dataset_type != datasetType: 

2387 raise ConflictingDefinitionError( 

2388 "Source butler dataset type differs from definition" 

2389 f" in target butler: {datasetType} !=" 

2390 f" {target_dataset_type}" 

2391 ) 

2392 if newly_registered_dataset_types: 

2393 # We may have registered some even if there were inconsistencies 

2394 # but should let people know (or else remove them again). 

2395 log.log( 

2396 VERBOSE, 

2397 "Registered the following dataset types in the target Butler: %s", 

2398 ", ".join(d.name for d in newly_registered_dataset_types), 

2399 ) 

2400 else: 

2401 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2402 

2403 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2404 if transfer_dimensions: 

2405 # Collect all the dimension records for these refs. 

2406 # All dimensions are to be copied but the list of valid dimensions 

2407 # comes from this butler's universe. 

2408 elements = frozenset( 

2409 element 

2410 for element in self.dimensions.getStaticElements() 

2411 if element.hasTable() and element.viewOf is None 

2412 ) 

2413 dataIds = {ref.dataId for ref in source_refs} 

2414 # This logic comes from saveDataIds. 

2415 for dataId in dataIds: 

2416 # Need an expanded record, if not expanded that we need a full 

2417 # butler with registry (allow mocks with registry too). 

2418 if not dataId.hasRecords(): 

2419 if registry := getattr(source_butler, "registry", None): 

2420 dataId = registry.expandDataId(dataId) 

2421 else: 

2422 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2423 # If this butler doesn't know about a dimension in the source 

2424 # butler things will break later. 

2425 for record in dataId.records.values(): 

2426 if record is not None and record.definition in elements: 

2427 dimension_records[record.definition].setdefault(record.dataId, record) 

2428 

2429 handled_collections: set[str] = set() 

2430 

2431 # Do all the importing in a single transaction. 

2432 with self.transaction(): 

2433 if dimension_records: 

2434 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2435 for element, r in dimension_records.items(): 

2436 records = [r[dataId] for dataId in r] 

2437 # Assume that if the record is already present that we can 

2438 # use it without having to check that the record metadata 

2439 # is consistent. 

2440 self._registry.insertDimensionData(element, *records, skip_existing=True) 

2441 

2442 n_imported = 0 

2443 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2444 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2445 ): 

2446 if run not in handled_collections: 

2447 # May need to create output collection. If source butler 

2448 # has a registry, ask for documentation string. 

2449 run_doc = None 

2450 if registry := getattr(source_butler, "registry", None): 

2451 run_doc = registry.getCollectionDocumentation(run) 

2452 registered = self._registry.registerRun(run, doc=run_doc) 

2453 handled_collections.add(run) 

2454 if registered: 

2455 log.log(VERBOSE, "Creating output run %s", run) 

2456 

2457 n_refs = len(refs_to_import) 

2458 log.verbose( 

2459 "Importing %d ref%s of dataset type %s into run %s", 

2460 n_refs, 

2461 "" if n_refs == 1 else "s", 

2462 datasetType.name, 

2463 run, 

2464 ) 

2465 

2466 # Assume we are using UUIDs and the source refs will match 

2467 # those imported. 

2468 imported_refs = self._registry._importDatasets(refs_to_import, expand=False) 

2469 assert set(imported_refs) == set(refs_to_import) 

2470 n_imported += len(imported_refs) 

2471 

2472 assert len(source_refs) == n_imported 

2473 log.verbose("Imported %d datasets into destination butler", n_imported) 

2474 

2475 # Ask the datastore to transfer. The datastore has to check that 

2476 # the source datastore is compatible with the target datastore. 

2477 accepted, rejected = self._datastore.transfer_from( 

2478 source_butler._datastore, 

2479 source_refs, 

2480 transfer=transfer, 

2481 artifact_existence=artifact_existence, 

2482 ) 

2483 if rejected: 

2484 # For now, accept the registry entries but not the files. 

2485 log.warning( 

2486 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2487 len(rejected), 

2488 len(accepted), 

2489 datasetType, 

2490 run, 

2491 ) 

2492 

2493 return source_refs 

2494 
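# Illustrative sketch for `transfer_from`; the source repository path,
# collection and dataset type names are assumed.
#
#     source = Butler("/path/to/source_repo")
#     refs = source.registry.queryDatasets("calexp", collections=["HSC/runs/example"])
#     transferred = butler.transfer_from(
#         source, refs, transfer="copy", register_dataset_types=True, transfer_dimensions=True
#     )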

2495 def validateConfiguration( 

2496 self, 

2497 logFailures: bool = False, 

2498 datasetTypeNames: Iterable[str] | None = None, 

2499 ignore: Iterable[str] | None = None, 

2500 ) -> None: 

2501 """Validate butler configuration. 

2502 

2503 Checks that each `DatasetType` can be stored in the `Datastore`. 

2504 

2505 Parameters 

2506 ---------- 

2507 logFailures : `bool`, optional 

2508 If `True`, output a log message for every validation error 

2509 detected. 

2510 datasetTypeNames : iterable of `str`, optional 

2511 The `DatasetType` names that should be checked. This allows 

2512 only a subset to be selected. 

2513 ignore : iterable of `str`, optional 

2514 Names of DatasetTypes to skip over. This can be used to skip 

2515 known problems. If a named `DatasetType` corresponds to a 

2516 composite, all components of that `DatasetType` will also be 

2517 ignored. 

2518 

2519 Raises 

2520 ------ 

2521 ButlerValidationError 

2522 Raised if there is some inconsistency with how this Butler 

2523 is configured. 

2524 """ 

2525 if datasetTypeNames: 

2526 datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] 

2527 else: 

2528 datasetTypes = list(self._registry.queryDatasetTypes()) 

2529 

2530 # filter out anything from the ignore list 

2531 if ignore: 

2532 ignore = set(ignore) 

2533 datasetTypes = [ 

2534 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2535 ] 

2536 else: 

2537 ignore = set() 

2538 

2539 # For each datasetType that has an instrument dimension, create 

2540 # a DatasetRef for each defined instrument 

2541 datasetRefs = [] 

2542 

2543 # Find all the registered instruments (if "instrument" is in the 

2544 # universe). 

2545 if "instrument" in self.dimensions: 

2546 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2547 

2548 for datasetType in datasetTypes: 

2549 if "instrument" in datasetType.dimensions: 

2550 # In order to create a conforming dataset ref, create 

2551 # fake DataCoordinate values for the non-instrument 

2552 # dimensions. The type of the value does not matter here. 

2553 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} 

2554 

2555 for instrument in instruments: 

2556 datasetRef = DatasetRef( 

2557 datasetType, 

2558 DataCoordinate.standardize( 

2559 dataId, instrument=instrument, graph=datasetType.dimensions 

2560 ), 

2561 run="validate", 

2562 ) 

2563 datasetRefs.append(datasetRef) 

2564 

2565 entities: list[DatasetType | DatasetRef] = [] 

2566 entities.extend(datasetTypes) 

2567 entities.extend(datasetRefs) 

2568 

2569 datastoreErrorStr = None 

2570 try: 

2571 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2572 except ValidationError as e: 

2573 datastoreErrorStr = str(e) 

2574 

2575 # Also check that the LookupKeys used by the datastores match 

2576 # registry and storage class definitions 

2577 keys = self._datastore.getLookupKeys() 

2578 

2579 failedNames = set() 

2580 failedDataId = set() 

2581 for key in keys: 

2582 if key.name is not None: 

2583 if key.name in ignore: 

2584 continue 

2585 

2586 # skip if specific datasetType names were requested and this 

2587 # name does not match 

2588 if datasetTypeNames and key.name not in datasetTypeNames: 

2589 continue 

2590 

2591 # See if it is a StorageClass or a DatasetType 

2592 if key.name in self.storageClasses: 

2593 pass 

2594 else: 

2595 try: 

2596 self._registry.getDatasetType(key.name) 

2597 except KeyError: 

2598 if logFailures: 

2599 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2600 failedNames.add(key) 

2601 else: 

2602 # Dimensions are checked for consistency when the Butler 

2603 # is created and rendezvoused with a universe. 

2604 pass 

2605 

2606 # Check that the instrument is a valid instrument 

2607 # Currently only support instrument so check for that 

2608 if key.dataId: 

2609 dataIdKeys = set(key.dataId) 

2610 if {"instrument"} != dataIdKeys: 

2611 if logFailures: 

2612 log.critical("Key '%s' has unsupported DataId override", key) 

2613 failedDataId.add(key) 

2614 elif key.dataId["instrument"] not in instruments: 

2615 if logFailures: 

2616 log.critical("Key '%s' has unknown instrument", key) 

2617 failedDataId.add(key) 

2618 

2619 messages = [] 

2620 

2621 if datastoreErrorStr: 

2622 messages.append(datastoreErrorStr) 

2623 

2624 for failed, msg in ( 

2625 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2626 (failedDataId, "Keys with bad DataId entries: "), 

2627 ): 

2628 if failed: 

2629 msg += ", ".join(str(k) for k in failed) 

2630 messages.append(msg) 

2631 

2632 if messages: 

2633 raise ValidationError(";\n".join(messages)) 

2634 
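# Illustrative sketch for `validateConfiguration`; the ignored dataset type
# name is an assumption.
#
#     butler.validateConfiguration(logFailures=True, ignore=["packages"])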

2635 @property 

2636 def collections(self) -> Sequence[str]: 

2637 """The collections to search by default, in order 

2638 (`~collections.abc.Sequence` [ `str` ]). 

2639 

2640 This is an alias for ``self.registry.defaults.collections``. It cannot 

2641 be set directly in isolation, but all defaults may be changed together 

2642 by assigning a new `RegistryDefaults` instance to 

2643 ``self.registry.defaults``. 

2644 """ 

2645 return self._registry.defaults.collections 

2646 

2647 @property 

2648 def run(self) -> str | None: 

2649 """Name of the run this butler writes outputs to by default (`str` or 

2650 `None`). 

2651 

2652 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2653 directly in isolation, but all defaults may be changed together by 

2654 assigning a new `RegistryDefaults` instance to 

2655 ``self.registry.defaults``. 

2656 """ 

2657 return self._registry.defaults.run 

2658 
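# Illustrative sketch of changing all registry defaults together, as described
# above; the collection and run names are assumed.
#
#     from lsst.daf.butler.registry import RegistryDefaults
#     butler.registry.defaults = RegistryDefaults(
#         collections=["HSC/defaults"], run="u/someone/run1"
#     )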

2659 @property 

2660 def registry(self) -> Registry: 

2661 """The object that manages dataset metadata and relationships 

2662 (`Registry`). 

2663 

2664 Many operations that don't involve reading or writing butler datasets 

2665 are accessible only via `Registry` methods. Eventually these methods 

2666 will be replaced by equivalent `Butler` methods. 

2667 """ 

2668 return self._registry_shim 

2669 

2670 @property 

2671 def dimensions(self) -> DimensionUniverse: 

2672 # Docstring inherited. 

2673 return self._registry.dimensions 

2674 

2675 _registry: _ButlerRegistry 

2676 """The object that manages dataset metadata and relationships 

2677 (`_ButlerRegistry`). 

2678 

2679 Most operations that don't involve reading or writing butler datasets are 

2680 accessible only via `Registry` methods. 

2681 """ 

2682 

2683 datastore: Datastore 

2684 """The object that manages actual dataset storage (`Datastore`). 

2685 

2686 Direct user access to the datastore should rarely be necessary; the primary 

2687 exception is the case where a `Datastore` implementation provides extra 

2688 functionality beyond what the base class defines. 

2689 """ 

2690 

2691 storageClasses: StorageClassFactory 

2692 """An object that maps known storage class names to objects that fully 

2693 describe them (`StorageClassFactory`). 

2694 """