Coverage for python/lsst/daf/butler/_butler.py: 11%

723 statements  

coverage.py v7.2.7, created at 2023-07-21 09:55 +0000

1 # This file is part of daf_butler.

2 #

3 # Developed for the LSST Data Management System.

4 # This product includes software developed by the LSST Project

5 # (http://www.lsst.org).

6 # See the COPYRIGHT file at the top-level directory of this distribution

7 # for details of code ownership.

8 #

9 # This program is free software: you can redistribute it and/or modify

10 # it under the terms of the GNU General Public License as published by

11 # the Free Software Foundation, either version 3 of the License, or

12 # (at your option) any later version.

13 #

14 # This program is distributed in the hope that it will be useful,

15 # but WITHOUT ANY WARRANTY; without even the implied warranty of

16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

17 # GNU General Public License for more details.

18 #

19 # You should have received a copy of the GNU General Public License

20 # along with this program. If not, see <http://www.gnu.org/licenses/>.

21 

22"""Butler top level classes. 

23""" 

24from __future__ import annotations 

25 

26__all__ = ( 

27 "Butler", 

28 "ButlerValidationError", 

29 )

30 

31 import collections.abc

32 import contextlib

33 import io

34 import logging

35 import numbers

36 import os

37 import warnings

38 from collections import Counter, defaultdict

39 from collections.abc import Iterable, Iterator, MutableMapping, Sequence

40 from typing import TYPE_CHECKING, Any, ClassVar, TextIO

41 

42 from deprecated.sphinx import deprecated

43 from lsst.resources import ResourcePath, ResourcePathExpression

44 from lsst.utils import doImportType

45 from lsst.utils.introspection import get_class_of

46 from lsst.utils.logging import VERBOSE, getLogger

47 from sqlalchemy.exc import IntegrityError

48 

49 from ._butlerConfig import ButlerConfig

50 from ._butlerRepoIndex import ButlerRepoIndex

51 from ._dataset_existence import DatasetExistence

52 from ._deferredDatasetHandle import DeferredDatasetHandle

53 from ._limited_butler import LimitedButler

54 from ._registry_shim import RegistryShim

55 from .core import (

56 Config, 

57 ConfigSubset, 

58 DataCoordinate, 

59 DataId, 

60 DataIdValue, 

61 DatasetIdGenEnum, 

62 DatasetRef, 

63 DatasetRefURIs, 

64 DatasetType, 

65 Datastore, 

66 Dimension, 

67 DimensionConfig, 

68 DimensionElement, 

69 DimensionRecord, 

70 DimensionUniverse, 

71 FileDataset, 

72 Progress, 

73 StorageClass, 

74 StorageClassFactory, 

75 Timespan, 

76 ValidationError, 

77 )

78 from .core.repoRelocation import BUTLER_ROOT_TAG

79 from .core.utils import transactional

80 from .registry import (

81 CollectionType, 

82 ConflictingDefinitionError, 

83 DataIdError, 

84 MissingDatasetTypeError, 

85 NoDefaultCollectionError, 

86 Registry, 

87 RegistryConfig, 

88 RegistryDefaults, 

89 _ButlerRegistry, 

90 _RegistryFactory, 

91 )

92 from .transfers import RepoExportContext

93 

94 if TYPE_CHECKING:

95 from lsst.resources import ResourceHandleProtocol 

96 

97 from .transfers import RepoImportBackend 

98 

99 log = getLogger(__name__)

100 

101 

102 class ButlerValidationError(ValidationError):

103 """There is a problem with the Butler configuration.""" 

104 

105 pass 

106 

107 

108 class Butler(LimitedButler):

109 """Main entry point for the data access system. 

110 

111 Parameters 

112 ---------- 

113 config : `ButlerConfig`, `Config` or `str`, optional

114 Configuration. Anything acceptable to the 

115 `ButlerConfig` constructor. If a directory path 

116 is given the configuration will be read from a ``butler.yaml`` file in 

117 that location. If `None` is given, default values will be used.

118 butler : `Butler`, optional

119 If provided, construct a new Butler that uses the same registry and 

120 datastore as the given one, but with the given collection and run. 

121 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

122 arguments. 

123 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

124 An expression specifying the collections to be searched (in order) when 

125 reading datasets. 

126 This may be a `str` collection name or an iterable thereof. 

127 See :ref:`daf_butler_collection_expressions` for more information. 

128 These collections are not registered automatically and must be 

129 manually registered before they are used by any method, but they may be 

130 manually registered after the `Butler` is initialized. 

131 run : `str`, optional 

132 Name of the `~CollectionType.RUN` collection new datasets should be 

133 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

134 ``collections`` will be set to ``[run]``. If not `None`, this 

135 collection will automatically be registered. If this is not set (and 

136 ``writeable`` is not set either), a read-only butler will be created. 

137 searchPaths : `list` of `str`, optional 

138 Directory paths to search when calculating the full Butler 

139 configuration. Not used if the supplied config is already a 

140 `ButlerConfig`. 

141 writeable : `bool`, optional 

142 Explicitly sets whether the butler supports write operations. If not 

143 provided, a read-write butler is created if any of ``run``, ``tags``, 

144 or ``chains`` is non-empty. 

145 inferDefaults : `bool`, optional 

146 If `True` (default) infer default data ID values from the values 

147 present in the datasets in ``collections``: if all collections have the 

148 same value (or no value) for a governor dimension, that value will be 

149 the default for that dimension. Nonexistent collections are ignored. 

150 If a default value is provided explicitly for a governor dimension via 

151 ``**kwargs``, no default will be inferred for that dimension. 

152 **kwargs : `str` 

153 Default data ID key-value pairs. These may only identify "governor" 

154 dimensions like ``instrument`` and ``skymap``. 

155 

156 Examples 

157 -------- 

158 While there are many ways to control exactly how a `Butler` interacts with 

159 the collections in its `Registry`, the most common cases are still simple. 

160 

161 For a read-only `Butler` that searches one collection, do:: 

162 

163 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

164 

165 For a read-write `Butler` that writes to and reads from a 

166 `~CollectionType.RUN` collection:: 

167 

168 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

169 

170 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

171 because we want to write to one `~CollectionType.RUN` collection but read 

172 from several others (as well):: 

173 

174 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

175 collections=["u/alice/DM-50000/a", 

176 "u/bob/DM-49998", 

177 "HSC/defaults"]) 

178 

179 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

180 Datasets will be read first from that run (since it appears first in the 

181 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

182 

183 Finally, one can always create a `Butler` with no collections:: 

184 

185 butler = Butler("/path/to/repo", writeable=True) 

186 

187 This can be extremely useful when you just want to use ``butler.registry``, 

188 e.g. for inserting dimension data or managing collections, or when the 

189 collections you want to use with the butler are not consistent. 

190 Passing ``writeable`` explicitly here is only necessary if you want to be 

191 able to make changes to the repo; usually the value for ``writeable`` can

192 be guessed from the collection arguments provided, but it defaults to 

193 `False` when no collection arguments are given.

194 """ 

195 

196 def __init__( 

197 self, 

198 config: Config | ResourcePathExpression | None = None, 

199 *, 

200 butler: Butler | None = None, 

201 collections: Any = None, 

202 run: str | None = None, 

203 searchPaths: Sequence[ResourcePathExpression] | None = None, 

204 writeable: bool | None = None, 

205 inferDefaults: bool = True, 

206 **kwargs: str, 

207 ): 

208 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

209 # Load registry, datastore, etc. from config or existing butler. 

210 if butler is not None: 

211 if config is not None or searchPaths is not None or writeable is not None: 

212 raise TypeError( 

213 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

214 ) 

215 self._registry = butler._registry.copy(defaults) 

216 self._datastore = butler._datastore 

217 self.storageClasses = butler.storageClasses 

218 self._config: ButlerConfig = butler._config 

219 else: 

220 self._config = ButlerConfig(config, searchPaths=searchPaths) 

221 try: 

222 if "root" in self._config: 

223 butlerRoot = self._config["root"] 

224 else: 

225 butlerRoot = self._config.configDir 

226 if writeable is None: 

227 writeable = run is not None 

228 self._registry = _RegistryFactory(self._config).from_config( 

229 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

230 ) 

231 self._datastore = Datastore.fromConfig( 

232 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

233 ) 

234 self.storageClasses = StorageClassFactory() 

235 self.storageClasses.addFromConfig(self._config) 

236 except Exception: 

237 # Failures here usually mean that configuration is incomplete, 

238 # so just issue an error message that includes the config file URI.

239 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

240 raise 

241 

242 # For an execution butler the datastore needs a special

243 # dependency-inversion trick. This is not used by a regular butler,

244 # but we have no way to distinguish a regular butler from an

245 # execution butler.

246 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

247 

248 if "run" in self._config or "collection" in self._config: 

249 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

250 

251 self._registry_shim = RegistryShim(self) 

252 

253 GENERATION: ClassVar[int] = 3 

254 """This is a Generation 3 Butler. 

255 

256 This attribute may be removed in the future, once the Generation 2 Butler 

257 interface has been fully retired; it should only be used in transitional 

258 code. 

259 """ 

260 

261 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

262 """Return DatasetType defined in registry given dataset type name.""" 

263 try: 

264 return self._registry.getDatasetType(name) 

265 except MissingDatasetTypeError: 

266 return None 

267 

268 @classmethod 

269 def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: 

270 """Look up the label in a butler repository index. 

271 

272 Parameters 

273 ---------- 

274 label : `str` 

275 Label of the Butler repository to look up. 

276 return_label : `bool`, optional 

277 If ``label`` cannot be found in the repository index (either 

278 because the index is not defined or ``label`` is not in the index) and

279 ``return_label`` is `True` then return ``ResourcePath(label)``. 

280 If ``return_label`` is `False` (default) then an exception will be 

281 raised instead. 

282 

283 Returns 

284 ------- 

285 uri : `lsst.resources.ResourcePath` 

286 URI to the Butler repository associated with the given label or 

287 default value if it is provided. 

288 

289 Raises 

290 ------ 

291 KeyError 

292 Raised if the label is not found in the index, or if an index 

293 is not defined, and ``return_label`` is `False`. 

294 

295 Notes 

296 ----- 

297 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

298 information is discovered. 

299 """ 

300 return ButlerRepoIndex.get_repo_uri(label, return_label) 

301 

302 @classmethod 

303 def get_known_repos(cls) -> set[str]: 

304 """Retrieve the list of known repository labels. 

305 

306 Returns 

307 ------- 

308 repos : `set` of `str` 

309 All the known labels. Can be empty if no index can be found. 

310 

311 Notes 

312 ----- 

313 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

314 information is discovered. 

315 """ 

316 return ButlerRepoIndex.get_known_repos() 

317 

318 @staticmethod 

319 def makeRepo( 

320 root: ResourcePathExpression, 

321 config: Config | str | None = None, 

322 dimensionConfig: Config | str | None = None, 

323 standalone: bool = False, 

324 searchPaths: list[str] | None = None, 

325 forceConfigRoot: bool = True, 

326 outfile: ResourcePathExpression | None = None, 

327 overwrite: bool = False, 

328 ) -> Config: 

329 """Create an empty data repository by adding a butler.yaml config 

330 to a repository root directory. 

331 

332 Parameters 

333 ---------- 

334 root : `lsst.resources.ResourcePathExpression` 

335 Path or URI to the root location of the new repository. Will be 

336 created if it does not exist. 

337 config : `Config` or `str`, optional 

338 Configuration to write to the repository, after setting any 

339 root-dependent Registry or Datastore config options. Can not 

340 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

341 configuration will be used. Root-dependent config options 

342 specified in this config are overwritten if ``forceConfigRoot`` 

343 is `True`. 

344 dimensionConfig : `Config` or `str`, optional 

345 Configuration for dimensions, will be used to initialize registry 

346 database. 

347 standalone : `bool` 

348 If `True`, write all expanded defaults, not just customized or

349 repository-specific settings. 

350 This (mostly) decouples the repository from the default 

351 configuration, insulating it from changes to the defaults (which 

352 may be good or bad, depending on the nature of the changes). 

353 Future *additions* to the defaults will still be picked up when 

354 initializing `Butlers` to repos created with ``standalone=True``. 

355 searchPaths : `list` of `str`, optional 

356 Directory paths to search when calculating the full butler 

357 configuration. 

358 forceConfigRoot : `bool`, optional 

359 If `False`, any values present in the supplied ``config`` that 

360 would normally be reset are not overridden and will appear 

361 directly in the output config. This allows non-standard overrides 

362 of the root directory for a datastore or registry to be given. 

363 If this parameter is `True` the values for ``root`` will be 

364 forced into the resulting config if appropriate. 

365 outfile : `lsst.resources.ResourcePathExpression`, optional

366 If not `None`, the output configuration will be written to this

367 location rather than into the repository itself. Can be a URI 

368 string. Can refer to a directory that will be used to write 

369 ``butler.yaml``. 

370 overwrite : `bool`, optional 

371 Create a new configuration file even if one already exists 

372 in the specified output location. Default is to raise 

373 an exception. 

374 

375 Returns 

376 ------- 

377 config : `Config` 

378 The updated `Config` instance written to the repo. 

379 

380 Raises 

381 ------ 

382 ValueError 

383 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

384 regular Config (as these subclasses would make it impossible to 

385 support ``standalone=False``). 

386 FileExistsError 

387 Raised if the output config file already exists. 

388 os.error 

389 Raised if the directory does not exist, exists but is not a 

390 directory, or cannot be created. 

391 

392 Notes 

393 ----- 

394 Note that when ``standalone=False`` (the default), the configuration 

395 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

396 construct the repository should also be used to construct any Butlers 

397 to avoid configuration inconsistencies. 
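
Examples
--------
A minimal sketch of creating a repository and then constructing a
`Butler` against it (the path and run name below are illustrative)::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", run="u/alice/scratch")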

398 """ 

399 if isinstance(config, (ButlerConfig, ConfigSubset)): 

400 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

401 

402 # Ensure that the root of the repository exists or can be made 

403 root_uri = ResourcePath(root, forceDirectory=True) 

404 root_uri.mkdir() 

405 

406 config = Config(config) 

407 

408 # If we are creating a new repo from scratch with relative roots, 

409 # do not propagate an explicit root from the config file 

410 if "root" in config: 

411 del config["root"] 

412 

413 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

414 imported_class = doImportType(full["datastore", "cls"]) 

415 if not issubclass(imported_class, Datastore): 

416 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

417 datastoreClass: type[Datastore] = imported_class 

418 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

419 

420 # If the key exists in the given config, parse it; otherwise parse the defaults

421 # in the expanded config 

422 if config.get(("registry", "db")): 

423 registryConfig = RegistryConfig(config) 

424 else: 

425 registryConfig = RegistryConfig(full) 

426 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

427 if defaultDatabaseUri is not None: 

428 Config.updateParameters( 

429 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

430 ) 

431 else: 

432 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

433 

434 if standalone: 

435 config.merge(full) 

436 else: 

437 # Always expand the registry.managers section into the per-repo 

438 # config, because after the database schema is created, it's not 

439 # allowed to change anymore. Note that in the standalone=True 

440 # branch, _everything_ in the config is expanded, so there's no 

441 # need to special case this. 

442 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

443 configURI: ResourcePathExpression 

444 if outfile is not None: 

445 # When writing to a separate location we must include 

446 # the root of the butler repo in the config else it won't know 

447 # where to look. 

448 config["root"] = root_uri.geturl() 

449 configURI = outfile 

450 else: 

451 configURI = root_uri 

452 # Strip obscore configuration, if it is present, before writing config 

453 # to a file; the obscore config will be stored in the registry.

454 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

455 config_to_write = config.copy() 

456 del config_to_write[obscore_config_key] 

457 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

458 # The configFile attribute is updated; copy it back to the original.

459 config.configFile = config_to_write.configFile 

460 else: 

461 config.dumpToUri(configURI, overwrite=overwrite) 

462 

463 # Create Registry and populate tables 

464 registryConfig = RegistryConfig(config.get("registry")) 

465 dimensionConfig = DimensionConfig(dimensionConfig) 

466 _RegistryFactory(registryConfig).create_from_config( 

467 dimensionConfig=dimensionConfig, butlerRoot=root_uri 

468 ) 

469 

470 log.verbose("Wrote new Butler configuration file to %s", configURI) 

471 

472 return config 

473 

474 @classmethod 

475 def _unpickle( 

476 cls, 

477 config: ButlerConfig, 

478 collections: tuple[str, ...] | None, 

479 run: str | None, 

480 defaultDataId: dict[str, str], 

481 writeable: bool, 

482 ) -> Butler: 

483 """Callable used to unpickle a Butler. 

484 

485 We prefer not to use ``Butler.__init__`` directly so we can force some 

486 of its many arguments to be keyword-only (note that ``__reduce__`` 

487 can only invoke callables with positional arguments). 

488 

489 Parameters 

490 ---------- 

491 config : `ButlerConfig` 

492 Butler configuration, already coerced into a true `ButlerConfig` 

493 instance (and hence after any search paths for overrides have been 

494 utilized). 

495 collections : `tuple` [ `str` ] 

496 Names of the default collections to read from. 

497 run : `str`, optional 

498 Name of the default `~CollectionType.RUN` collection to write to. 

499 defaultDataId : `dict` [ `str`, `str` ] 

500 Default data ID values. 

501 writeable : `bool` 

502 Whether the Butler should support write operations. 

503 

504 Returns 

505 ------- 

506 butler : `Butler` 

507 A new `Butler` instance. 

508 """ 

509 # MyPy doesn't recognize that the kwargs below are totally valid; it 

510 # seems to think ``**defaultDataId`` is a _positional_ argument!

511 return cls( 

512 config=config, 

513 collections=collections, 

514 run=run, 

515 writeable=writeable, 

516 **defaultDataId, # type: ignore 

517 ) 

518 

519 def __reduce__(self) -> tuple: 

520 """Support pickling.""" 

521 return ( 

522 Butler._unpickle, 

523 ( 

524 self._config, 

525 self.collections, 

526 self.run, 

527 self._registry.defaults.dataId.byName(), 

528 self._registry.isWriteable(), 

529 ), 

530 ) 

531 

532 def __str__(self) -> str: 

533 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

534 self.collections, self.run, self._datastore, self._registry 

535 ) 

536 

537 def isWriteable(self) -> bool: 

538 """Return `True` if this `Butler` supports write operations.""" 

539 return self._registry.isWriteable() 

540 

541 @contextlib.contextmanager 

542 def transaction(self) -> Iterator[None]: 

543 """Context manager supporting `Butler` transactions. 

544 

545 Transactions can be nested. 
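
Examples
--------
A minimal sketch (``obj``, the dataset type name, and the data ID
values are illustrative); if anything inside the block raises, both
the registry and datastore changes are rolled back::

    with butler.transaction():
        butler.put(obj, "calexp", instrument="HSC", detector=42,
                   visit=1234)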

546 """ 

547 with self._registry.transaction(): 

548 with self._datastore.transaction(): 

549 yield 

550 

551 def _standardizeArgs( 

552 self, 

553 datasetRefOrType: DatasetRef | DatasetType | str, 

554 dataId: DataId | None = None, 

555 for_put: bool = True, 

556 **kwargs: Any, 

557 ) -> tuple[DatasetType, DataId | None]: 

558 """Standardize the arguments passed to several Butler APIs. 

559 

560 Parameters 

561 ---------- 

562 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

563 When `DatasetRef` the `dataId` should be `None`. 

564 Otherwise the `DatasetType` or name thereof. 

565 dataId : `dict` or `DataCoordinate` 

566 A `dict` of `Dimension` link name, value pairs that label the 

567 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

568 should be provided as the second argument. 

569 for_put : `bool`, optional 

570 If `True` this call is invoked as part of a `Butler.put()`. 

571 Otherwise it is assumed to be part of a `Butler.get()`. This 

572 parameter is only relevant if there is dataset type 

573 inconsistency. 

574 **kwargs 

575 Additional keyword arguments used to augment or construct a 

576 `DataCoordinate`. See `DataCoordinate.standardize` 

577 parameters. 

578 

579 Returns 

580 ------- 

581 datasetType : `DatasetType` 

582 A `DatasetType` instance extracted from ``datasetRefOrType``. 

583 dataId : `dict` or `DataId`, optional 

584 Argument that can be used (along with ``kwargs``) to construct a 

585 `DataId`. 

586 

587 Notes 

588 ----- 

589 Butler APIs that conceptually need a DatasetRef also allow passing a 

590 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

591 keyword arguments that can be used to construct one) separately. This 

592 method accepts those arguments and always returns a true `DatasetType` 

593 and a `DataId` or `dict`. 

594 

595 Standardization of `dict` vs `DataId` is best handled by passing the 

596 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

597 generally similarly flexible. 

598 """ 

599 externalDatasetType: DatasetType | None = None 

600 internalDatasetType: DatasetType | None = None 

601 if isinstance(datasetRefOrType, DatasetRef): 

602 if dataId is not None or kwargs: 

603 raise ValueError("DatasetRef given, cannot use dataId as well") 

604 externalDatasetType = datasetRefOrType.datasetType 

605 dataId = datasetRefOrType.dataId 

606 else: 

607 # Don't check whether DataId is provided, because Registry APIs 

608 # can usually construct a better error message when it isn't.

609 if isinstance(datasetRefOrType, DatasetType): 

610 externalDatasetType = datasetRefOrType 

611 else: 

612 internalDatasetType = self._registry.getDatasetType(datasetRefOrType) 

613 

614 # Check that they are self-consistent 

615 if externalDatasetType is not None: 

616 internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) 

617 if externalDatasetType != internalDatasetType: 

618 # We can allow differences if they are compatible, depending 

619 # on whether this is a get or a put. A get requires that 

620 # the python type associated with the datastore can be 

621 # converted to the user type. A put requires that the user 

622 # supplied python type can be converted to the internal 

623 # type expected by registry. 

624 relevantDatasetType = internalDatasetType 

625 if for_put: 

626 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

627 else: 

628 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

629 relevantDatasetType = externalDatasetType 

630 if not is_compatible: 

631 raise ValueError( 

632 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

633 f"registry definition ({internalDatasetType})" 

634 ) 

635 # Override the internal definition. 

636 internalDatasetType = relevantDatasetType 

637 

638 assert internalDatasetType is not None 

639 return internalDatasetType, dataId 

640 

641 def _rewrite_data_id( 

642 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

643 ) -> tuple[DataId | None, dict[str, Any]]: 

644 """Rewrite a data ID taking into account dimension records. 

645 

646 Take a Data ID and keyword args and rewrite it if necessary to 

647 allow the user to specify dimension records rather than dimension 

648 primary values. 

649 

650 This allows a user to include a dataId dict with keys of 

651 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

652 the integer exposure ID. It also allows a string to be given 

653 for a dimension value rather than the integer ID if that is more 

654 convenient. For example, rather than having to specify the

655 detector with ``detector.full_name``, a string given for ``detector`` 

656 will be interpreted as the full name and converted to the integer 

657 value. 

658 

659 Keyword arguments can also use strings for dimensions like detector 

660 and exposure, but Python does not allow them to include ``.``, and

661 so the ``exposure.day_obs`` syntax cannot be used in a keyword

662 argument. 

663 

664 Parameters 

665 ---------- 

666 dataId : `dict` or `DataCoordinate` 

667 A `dict` of `Dimension` link name, value pairs that will label the 

668 `DatasetRef` within a Collection. 

669 datasetType : `DatasetType` 

670 The dataset type associated with this dataId. Required to 

671 determine the relevant dimensions. 

672 **kwargs 

673 Additional keyword arguments used to augment or construct a 

674 `DataId`. See `DataId` parameters. 

675 

676 Returns 

677 ------- 

678 dataId : `dict` or `DataCoordinate` 

679 The possibly rewritten dataId. If given a `DataCoordinate` and

680 no keyword arguments, the original dataId will be returned 

681 unchanged. 

682 **kwargs : `dict` 

683 Any unused keyword arguments (normally an empty `dict`).
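
Examples
--------
Illustrative input forms that this method can rewrite (the values are
made up); both are converted to the primary-key form used by registry
queries, e.g. an integer ``exposure`` or ``detector`` ID::

    dataId = {"exposure.day_obs": 20230615, "exposure.seq_num": 45}
    dataId = {"instrument": "LSSTCam", "detector": "R22_S11"}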

684 """ 

685 # Do nothing if we have a standalone DataCoordinate. 

686 if isinstance(dataId, DataCoordinate) and not kwargs: 

687 return dataId, kwargs 

688 

689 # Process dimension records that are using record information 

690 # rather than ids 

691 newDataId: dict[str, DataIdValue] = {} 

692 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

693 

694 # if all the dataId comes from keyword parameters we do not need 

695 # to do anything here because they can't be of the form 

696 # exposure.obs_id, since a "." is not allowed in a keyword parameter.

697 if dataId: 

698 for k, v in dataId.items(): 

699 # If we have a Dimension we do not need to do anything 

700 # because it cannot be a compound key. 

701 if isinstance(k, str) and "." in k: 

702 # Someone is using a more human-readable dataId 

703 dimensionName, record = k.split(".", 1) 

704 byRecord[dimensionName][record] = v 

705 elif isinstance(k, Dimension): 

706 newDataId[k.name] = v 

707 else: 

708 newDataId[k] = v 

709 

710 # Go through the updated dataId and check the type in case someone is 

711 # using an alternate key. We have already filtered out the compound

712 # dimension.record keys.

713 not_dimensions = {} 

714 

715 # Will need to look in the dataId and the keyword arguments 

716 # and will remove them if they need to be fixed or are unrecognized. 

717 for dataIdDict in (newDataId, kwargs): 

718 # Use a list so we can adjust the dict safely in the loop 

719 for dimensionName in list(dataIdDict): 

720 value = dataIdDict[dimensionName] 

721 try: 

722 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

723 except KeyError: 

724 # This is not a real dimension 

725 not_dimensions[dimensionName] = value 

726 del dataIdDict[dimensionName] 

727 continue 

728 

729 # Convert an integral type to an explicit int to simplify 

730 # comparisons here 

731 if isinstance(value, numbers.Integral): 

732 value = int(value) 

733 

734 if not isinstance(value, dimension.primaryKey.getPythonType()): 

735 for alternate in dimension.alternateKeys: 

736 if isinstance(value, alternate.getPythonType()): 

737 byRecord[dimensionName][alternate.name] = value 

738 del dataIdDict[dimensionName] 

739 log.debug( 

740 "Converting dimension %s to %s.%s=%s", 

741 dimensionName, 

742 dimensionName, 

743 alternate.name, 

744 value, 

745 ) 

746 break 

747 else: 

748 log.warning( 

749 "Type mismatch found for value '%r' provided for dimension %s. " 

750 "Could not find matching alternative (primary key has type %s) " 

751 "so attempting to use as-is.", 

752 value, 

753 dimensionName, 

754 dimension.primaryKey.getPythonType(), 

755 ) 

756 

757 # By this point kwargs and newDataId should only include valid 

758 # dimensions. Merge kwargs in to the new dataId and log if there 

759 # are dimensions in both (rather than calling update). 

760 for k, v in kwargs.items(): 

761 if k in newDataId and newDataId[k] != v: 

762 log.debug( 

763 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

764 ) 

765 newDataId[k] = v 

766 # No need to retain any values in kwargs now. 

767 kwargs = {} 

768 

769 # If we have some unrecognized dimensions we have to try to connect 

770 # them to records in other dimensions. This is made more complicated 

771 # by some dimensions having records with clashing names. A mitigation 

772 # is that we can tell by this point which dimensions are missing 

773 # for the DatasetType but this does not work for calibrations 

774 # where additional dimensions can be used to constrain the temporal 

775 # axis. 

776 if not_dimensions: 

777 # Search for all dimensions even if we have been given a value 

778 # explicitly. In some cases records are given as well as the 

779 # actual dimension, and this should not be an error if they

780 # match.

781 mandatoryDimensions = datasetType.dimensions.names # - provided 

782 

783 candidateDimensions: set[str] = set() 

784 candidateDimensions.update(mandatoryDimensions) 

785 

786 # For calibrations we may well be needing temporal dimensions 

787 # so rather than always including all dimensions in the scan 

788 # restrict things a little. It is still possible for there 

789 # to be confusion over day_obs in visit vs exposure for example. 

790 # If we are not searching calibration collections things may 

791 # fail but they are going to fail anyway because of the 

792 # ambiguity of the dataId...

793 if datasetType.isCalibration(): 

794 for dim in self.dimensions.getStaticDimensions(): 

795 if dim.temporal: 

796 candidateDimensions.add(str(dim)) 

797 

798 # Look up table for the first association with a dimension 

799 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

800 

801 # Keep track of whether an item is associated with multiple 

802 # dimensions. 

803 counter: Counter[str] = Counter() 

804 assigned: dict[str, set[str]] = defaultdict(set) 

805 

806 # Go through the missing dimensions and associate the 

807 # given names with records within those dimensions 

808 matched_dims = set() 

809 for dimensionName in candidateDimensions: 

810 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

811 fields = dimension.metadata.names | dimension.uniqueKeys.names 

812 for field in not_dimensions: 

813 if field in fields: 

814 guessedAssociation[dimensionName][field] = not_dimensions[field] 

815 counter[dimensionName] += 1 

816 assigned[field].add(dimensionName) 

817 matched_dims.add(field) 

818 

819 # Calculate the fields that matched nothing. 

820 never_found = set(not_dimensions) - matched_dims 

821 

822 if never_found: 

823 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

824 

825 # There is a chance we have allocated a single dataId item 

826 # to multiple dimensions. Need to decide which should be retained. 

827 # For now assume that the most popular alternative wins. 

828 # This means that day_obs with seq_num will result in 

829 # exposure.day_obs and not visit.day_obs 

830 # Also prefer an explicitly missing dimension over an inferred 

831 # temporal dimension. 

832 for fieldName, assignedDimensions in assigned.items(): 

833 if len(assignedDimensions) > 1: 

834 # Pick the most popular (preferring mandatory dimensions) 

835 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

836 if requiredButMissing: 

837 candidateDimensions = requiredButMissing 

838 else: 

839 candidateDimensions = assignedDimensions 

840 

841 # If this is a choice between visit and exposure and 

842 # neither was a required part of the dataset type, 

843 # (hence in this branch) always prefer exposure over 

844 # visit since exposures are always defined and visits 

845 # are defined from exposures. 

846 if candidateDimensions == {"exposure", "visit"}: 

847 candidateDimensions = {"exposure"} 

848 

849 # Select the relevant items and get a new restricted 

850 # counter. 

851 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

852 duplicatesCounter: Counter[str] = Counter() 

853 duplicatesCounter.update(theseCounts) 

854 

855 # Choose the most common. If they are equally common 

856 # we will pick the one that was found first. 

857 # most_common returns a list of (value, count) tuples.

858 selected = duplicatesCounter.most_common(1)[0][0] 

859 

860 log.debug( 

861 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

862 " Removed ambiguity by choosing dimension %s.", 

863 fieldName, 

864 ", ".join(assignedDimensions), 

865 selected, 

866 ) 

867 

868 for candidateDimension in assignedDimensions: 

869 if candidateDimension != selected: 

870 del guessedAssociation[candidateDimension][fieldName] 

871 

872 # Update the record look up dict with the new associations 

873 for dimensionName, values in guessedAssociation.items(): 

874 if values: # A dict might now be empty 

875 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

876 byRecord[dimensionName].update(values) 

877 

878 if byRecord: 

879 # Some record specifiers were found so we need to convert 

880 # them to the Id form 

881 for dimensionName, values in byRecord.items(): 

882 if dimensionName in newDataId: 

883 log.debug( 

884 "DataId specified explicit %s dimension value of %s in addition to" 

885 " general record specifiers for it of %s. Ignoring record information.", 

886 dimensionName, 

887 newDataId[dimensionName], 

888 str(values), 

889 ) 

890 # Get the actual record and compare with these values. 

891 try: 

892 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

893 except DataIdError: 

894 raise ValueError( 

895 f"Could not find dimension '{dimensionName}'" 

896 f" with dataId {newDataId} as part of comparing with" 

897 f" record values {byRecord[dimensionName]}" 

898 ) from None 

899 if len(recs) == 1: 

900 errmsg: list[str] = [] 

901 for k, v in values.items(): 

902 if (recval := getattr(recs[0], k)) != v: 

903 errmsg.append(f"{k}({recval} != {v})") 

904 if errmsg: 

905 raise ValueError( 

906 f"Dimension {dimensionName} in dataId has explicit value" 

907 " inconsistent with records: " + ", ".join(errmsg) 

908 ) 

909 else: 

910 # Multiple matches for an explicit dimension 

911 # should never happen but let downstream complain. 

912 pass 

913 continue 

914 

915 # Build up a WHERE expression 

916 bind = {k: v for k, v in values.items()} 

917 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

918 

919 # Hopefully we get a single record that matches 

920 records = set( 

921 self._registry.queryDimensionRecords( 

922 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

923 ) 

924 ) 

925 

926 if len(records) != 1: 

927 if len(records) > 1: 

928 # visit can have an ambiguous answer without involving 

929 # visit_system. The default visit_system is defined 

930 # by the instrument. 

931 if ( 

932 dimensionName == "visit" 

933 and "visit_system_membership" in self.dimensions 

934 and "visit_system" in self.dimensions["instrument"].metadata 

935 ): 

936 instrument_records = list( 

937 self._registry.queryDimensionRecords( 

938 "instrument", 

939 dataId=newDataId, 

940 **kwargs, 

941 ) 

942 ) 

943 if len(instrument_records) == 1: 

944 visit_system = instrument_records[0].visit_system 

945 if visit_system is None: 

946 # Set to a value that will never match. 

947 visit_system = -1 

948 

949 # Look up each visit in the 

950 # visit_system_membership records. 

951 for rec in records: 

952 membership = list( 

953 self._registry.queryDimensionRecords( 

954 # Use bind to allow zero results. 

955 # This is a fully-specified query. 

956 "visit_system_membership", 

957 where="instrument = inst AND visit_system = system AND visit = v", 

958 bind=dict( 

959 inst=instrument_records[0].name, system=visit_system, v=rec.id 

960 ), 

961 ) 

962 ) 

963 if membership: 

964 # This record is the right answer. 

965 records = {rec} 

966 break 

967 

968 # The ambiguity may have been resolved so check again. 

969 if len(records) > 1: 

970 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

971 for r in records: 

972 log.debug("- %s", str(r)) 

973 raise ValueError( 

974 f"DataId specification for dimension {dimensionName} is not" 

975 f" uniquely constrained to a single dataset by {values}." 

976 f" Got {len(records)} results." 

977 ) 

978 else: 

979 raise ValueError( 

980 f"DataId specification for dimension {dimensionName} matched no" 

981 f" records when constrained by {values}" 

982 ) 

983 

984 # Get the primary key from the real dimension object 

985 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

986 if not isinstance(dimension, Dimension): 

987 raise RuntimeError( 

988 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

989 ) 

990 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

991 

992 return newDataId, kwargs 

993 

994 def _findDatasetRef( 

995 self, 

996 datasetRefOrType: DatasetRef | DatasetType | str, 

997 dataId: DataId | None = None, 

998 *, 

999 collections: Any = None, 

1000 predict: bool = False, 

1001 run: str | None = None, 

1002 **kwargs: Any, 

1003 ) -> DatasetRef: 

1004 """Shared logic for methods that start with a search for a dataset in 

1005 the registry. 

1006 

1007 Parameters 

1008 ---------- 

1009 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1010 When `DatasetRef` the `dataId` should be `None`. 

1011 Otherwise the `DatasetType` or name thereof. 

1012 dataId : `dict` or `DataCoordinate`, optional 

1013 A `dict` of `Dimension` link name, value pairs that label the 

1014 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1015 should be provided as the first argument. 

1016 collections : Any, optional 

1017 Collections to be searched, overriding ``self.collections``. 

1018 Can be any of the types supported by the ``collections`` argument 

1019 to butler construction. 

1020 predict : `bool`, optional 

1021 If `True`, return a newly created `DatasetRef` with a unique 

1022 dataset ID if finding a reference in the `Registry` fails. 

1023 Defaults to `False`. 

1024 run : `str`, optional 

1025 Run collection name to use for creating `DatasetRef` for predicted 

1026 datasets. Only used if ``predict`` is `True`. 

1027 **kwargs 

1028 Additional keyword arguments used to augment or construct a 

1029 `DataId`. See `DataId` parameters. 

1030 

1031 Returns 

1032 ------- 

1033 ref : `DatasetRef` 

1034 A reference to the dataset identified by the given arguments. 

1035 This can be the same dataset reference as given if it was 

1036 resolved. 

1037 

1038 Raises 

1039 ------ 

1040 LookupError 

1041 Raised if no matching dataset exists in the `Registry` (and 

1042 ``predict`` is `False`). 

1043 ValueError 

1044 Raised if a resolved `DatasetRef` was passed as an input, but it 

1045 differs from the one found in the registry. 

1046 TypeError 

1047 Raised if no collections were provided. 

1048 """ 

1049 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1050 if isinstance(datasetRefOrType, DatasetRef): 

1051 if collections is not None: 

1052 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

1053 return datasetRefOrType 

1054 timespan: Timespan | None = None 

1055 

1056 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1057 

1058 if datasetType.isCalibration(): 

1059 # Because this is a calibration dataset, first try to

1060 # standardize the data ID without restricting the dimensions to

1061 # those of the dataset type requested, because there may be extra

1062 # dimensions that provide temporal information for a validity-range

1063 # lookup.

1064 dataId = DataCoordinate.standardize( 

1065 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

1066 ) 

1067 if dataId.graph.temporal: 

1068 dataId = self._registry.expandDataId(dataId) 

1069 timespan = dataId.timespan 

1070 else: 

1071 # Standardize the data ID to just the dimensions of the dataset 

1072 # type instead of letting registry.findDataset do it, so we get the

1073 # result even if no dataset is found.

1074 dataId = DataCoordinate.standardize( 

1075 dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

1076 ) 

1077 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1078 # present in the current collection. 

1079 ref = self._registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1080 if ref is None: 

1081 if predict: 

1082 if run is None: 

1083 run = self.run 

1084 if run is None: 

1085 raise TypeError("Cannot predict dataset ID/location with run=None.") 

1086 return DatasetRef(datasetType, dataId, run=run) 

1087 else: 

1088 if collections is None: 

1089 collections = self._registry.defaults.collections 

1090 raise LookupError( 

1091 f"Dataset {datasetType.name} with data ID {dataId} " 

1092 f"could not be found in collections {collections}." 

1093 ) 

1094 if datasetType != ref.datasetType: 

1095 # If they differ it is because the user explicitly specified 

1096 # a compatible dataset type to this call rather than using the 

1097 # registry definition. The DatasetRef must therefore be recreated 

1098 # using the user definition such that the expected type is 

1099 # returned. 

1100 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1101 

1102 return ref 

1103 

1104 @transactional 

1105 @deprecated( 

1106 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1107 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1108 " were relying on the run parameter to determine the run." 

1109 " Will be removed after v27.0.", 

1110 version="v26.0", 

1111 category=FutureWarning, 

1112 ) 

1113 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1114 # Docstring inherited. 

1115 return self.put(obj, ref) 

1116 

1117 @transactional 

1118 def put( 

1119 self, 

1120 obj: Any, 

1121 datasetRefOrType: DatasetRef | DatasetType | str, 

1122 /, 

1123 dataId: DataId | None = None, 

1124 *, 

1125 run: str | None = None, 

1126 **kwargs: Any, 

1127 ) -> DatasetRef: 

1128 """Store and register a dataset. 

1129 

1130 Parameters 

1131 ---------- 

1132 obj : `object` 

1133 The dataset. 

1134 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1135 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1136 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1137 `DatasetRef` is given the run and ID are used directly. 

1138 dataId : `dict` or `DataCoordinate` 

1139 A `dict` of `Dimension` link name, value pairs that label the 

1140 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1141 should be provided as the second argument. 

1142 run : `str`, optional 

1143 The name of the run the dataset should be added to, overriding 

1144 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1145 **kwargs 

1146 Additional keyword arguments used to augment or construct a 

1147 `DataCoordinate`. See `DataCoordinate.standardize` 

1148 parameters. Not used if a resolved `DatasetRef` is provided.

1149 

1150 Returns 

1151 ------- 

1152 ref : `DatasetRef` 

1153 A reference to the stored dataset, updated with the correct id if 

1154 given. 

1155 

1156 Raises 

1157 ------ 

1158 TypeError 

1159 Raised if the butler is read-only or if no run has been provided. 
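
Examples
--------
Hedged sketches of the two calling conventions described above
(``obj``, the dataset type name, the data ID values, and the run name
are illustrative)::

    # Dataset type name plus data ID keywords; the run must be set on
    # the butler or passed explicitly.
    ref = butler.put(obj, "calexp", instrument="HSC", detector=42,
                     visit=1234, run="u/alice/scratch")

    # A fully resolved DatasetRef: its own run and dataset ID are used.
    ref = butler.put(obj, existing_ref)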

1160 """ 

1161 if isinstance(datasetRefOrType, DatasetRef): 

1162 # This is a direct put of predefined DatasetRef. 

1163 log.debug("Butler put direct: %s", datasetRefOrType) 

1164 if run is not None: 

1165 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

1166 # If registry already has a dataset with the same dataset ID, 

1167 # dataset type and DataId, then _importDatasets will do nothing and 

1168 # just return the original ref. We have to raise in this case; the

1169 # datastore check below handles that.

1170 self._registry._importDatasets([datasetRefOrType], expand=True) 

1171 # Before trying to write to the datastore check that it does not 

1172 # know this dataset. This is prone to races, of course. 

1173 if self._datastore.knows(datasetRefOrType): 

1174 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

1175 # Try to write the dataset to the datastore; if it fails due to a race

1176 # with another write, the content of stored data may be 

1177 # unpredictable. 

1178 try: 

1179 self._datastore.put(obj, datasetRefOrType) 

1180 except IntegrityError as e: 

1181 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") 

1182 return datasetRefOrType 

1183 

1184 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1185 if not self.isWriteable(): 

1186 raise TypeError("Butler is read-only.") 

1187 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1188 

1189 # Handle dimension records in dataId 

1190 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1191 

1192 # Add Registry Dataset entry. 

1193 dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1194 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1195 self._datastore.put(obj, ref) 

1196 

1197 return ref 

1198 

1199 @deprecated( 

1200 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1201 " Please use Butler.get(). Will be removed after v27.0.", 

1202 version="v26.0", 

1203 category=FutureWarning, 

1204 ) 

1205 def getDirect( 

1206 self, 

1207 ref: DatasetRef, 

1208 *, 

1209 parameters: dict[str, Any] | None = None, 

1210 storageClass: StorageClass | str | None = None, 

1211 ) -> Any: 

1212 """Retrieve a stored dataset. 

1213 

1214 Parameters 

1215 ---------- 

1216 ref : `DatasetRef` 

1217 Resolved reference to an already stored dataset. 

1218 parameters : `dict` 

1219 Additional StorageClass-defined options to control reading, 

1220 typically used to efficiently read only a subset of the dataset. 

1221 storageClass : `StorageClass` or `str`, optional 

1222 The storage class to be used to override the Python type 

1223 returned by this method. By default the returned type matches 

1224 the dataset type definition for this dataset. Specifying a 

1225 read `StorageClass` can force a different type to be returned. 

1226 This type must be compatible with the original type. 

1227 

1228 Returns 

1229 ------- 

1230 obj : `object` 

1231 The dataset. 

1232 """ 

1233 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1234 

1235 @deprecated( 

1236 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1237 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1238 version="v26.0", 

1239 category=FutureWarning, 

1240 ) 

1241 def getDirectDeferred( 

1242 self, 

1243 ref: DatasetRef, 

1244 *, 

1245 parameters: dict | None = None, 

1246 storageClass: str | StorageClass | None = None, 

1247 ) -> DeferredDatasetHandle: 

1248 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1249 from a resolved `DatasetRef`. 

1250 

1251 Parameters 

1252 ---------- 

1253 ref : `DatasetRef` 

1254 Resolved reference to an already stored dataset. 

1255 parameters : `dict` 

1256 Additional StorageClass-defined options to control reading, 

1257 typically used to efficiently read only a subset of the dataset. 

1258 storageClass : `StorageClass` or `str`, optional 

1259 The storage class to be used to override the Python type 

1260 returned by this method. By default the returned type matches 

1261 the dataset type definition for this dataset. Specifying a 

1262 read `StorageClass` can force a different type to be returned. 

1263 This type must be compatible with the original type. 

1264 

1265 Returns 

1266 ------- 

1267 obj : `DeferredDatasetHandle` 

1268 A handle which can be used to retrieve a dataset at a later time. 

1269 

1270 Raises 

1271 ------ 

1272 LookupError 

1273 Raised if no matching dataset exists in the `Registry`. 

1274 """ 

1275 # Check that dataset actually exists. 

1276 if not self._datastore.exists(ref): 

1277 raise LookupError(f"Dataset reference {ref} does not exist.") 

1278 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1279 

1280 def getDeferred( 

1281 self, 

1282 datasetRefOrType: DatasetRef | DatasetType | str, 

1283 /, 

1284 dataId: DataId | None = None, 

1285 *, 

1286 parameters: dict | None = None, 

1287 collections: Any = None, 

1288 storageClass: str | StorageClass | None = None, 

1289 **kwargs: Any, 

1290 ) -> DeferredDatasetHandle: 

1291 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1292 after an immediate registry lookup. 

1293 

1294 Parameters 

1295 ---------- 

1296 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1297 When `DatasetRef` the `dataId` should be `None`. 

1298 Otherwise the `DatasetType` or name thereof. 

1299 dataId : `dict` or `DataCoordinate`, optional 

1300 A `dict` of `Dimension` link name, value pairs that label the 

1301 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1302 should be provided as the first argument. 

1303 parameters : `dict` 

1304 Additional StorageClass-defined options to control reading, 

1305 typically used to efficiently read only a subset of the dataset. 

1306 collections : Any, optional 

1307 Collections to be searched, overriding ``self.collections``. 

1308 Can be any of the types supported by the ``collections`` argument 

1309 to butler construction. 

1310 storageClass : `StorageClass` or `str`, optional 

1311 The storage class to be used to override the Python type 

1312 returned by this method. By default the returned type matches 

1313 the dataset type definition for this dataset. Specifying a 

1314 read `StorageClass` can force a different type to be returned. 

1315 This type must be compatible with the original type. 

1316 **kwargs 

1317 Additional keyword arguments used to augment or construct a 

1318 `DataId`. See `DataId` parameters. 

1319 

1320 Returns 

1321 ------- 

1322 obj : `DeferredDatasetHandle` 

1323 A handle which can be used to retrieve a dataset at a later time. 

1324 

1325 Raises 

1326 ------ 

1327 LookupError 

1328 Raised if no matching dataset exists in the `Registry`. 

1329 ValueError 

1330 Raised if a resolved `DatasetRef` was passed as an input, but it 

1331 differs from the one found in the registry. 

1332 TypeError 

1333 Raised if no collections were provided. 
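
Examples
--------
A minimal sketch (dataset type name and data ID values are
illustrative): the registry lookup happens immediately, but the
datastore read is deferred until the handle is used::

    handle = butler.getDeferred("calexp", instrument="HSC",
                                detector=42, visit=1234)
    calexp = handle.get()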

1334 """ 

1335 if isinstance(datasetRefOrType, DatasetRef) and not self._datastore.exists(datasetRefOrType): 

1336 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1337 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1338 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1339 

1340 def get( 

1341 self, 

1342 datasetRefOrType: DatasetRef | DatasetType | str, 

1343 /, 

1344 dataId: DataId | None = None, 

1345 *, 

1346 parameters: dict[str, Any] | None = None, 

1347 collections: Any = None, 

1348 storageClass: StorageClass | str | None = None, 

1349 **kwargs: Any, 

1350 ) -> Any: 

1351 """Retrieve a stored dataset. 

1352 

1353 Parameters 

1354 ---------- 

1355 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1356 When `DatasetRef` the `dataId` should be `None`. 

1357 Otherwise the `DatasetType` or name thereof. 

1358 If a resolved `DatasetRef`, the associated dataset 

1359 is returned directly without additional querying. 

1360 dataId : `dict` or `DataCoordinate` 

1361 A `dict` of `Dimension` link name, value pairs that label the 

1362 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1363 should be provided as the first argument. 

1364 parameters : `dict` 

1365 Additional StorageClass-defined options to control reading, 

1366 typically used to efficiently read only a subset of the dataset. 

1367 collections : Any, optional 

1368 Collections to be searched, overriding ``self.collections``. 

1369 Can be any of the types supported by the ``collections`` argument 

1370 to butler construction. 

1371 storageClass : `StorageClass` or `str`, optional 

1372 The storage class to be used to override the Python type 

1373 returned by this method. By default the returned type matches 

1374 the dataset type definition for this dataset. Specifying a 

1375 read `StorageClass` can force a different type to be returned. 

1376 This type must be compatible with the original type. 

1377 **kwargs 

1378 Additional keyword arguments used to augment or construct a 

1379 `DataCoordinate`. See `DataCoordinate.standardize` 

1380 parameters. 

1381 

1382 Returns 

1383 ------- 

1384 obj : `object` 

1385 The dataset. 

1386 

1387 Raises 

1388 ------ 

1389 LookupError 

1390 Raised if no matching dataset exists in the `Registry`. 

1391 TypeError 

1392 Raised if no collections were provided. 

1393 

1394 Notes 

1395 ----- 

1396 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1397 this method requires that the given data ID include temporal dimensions 

1398 beyond the dimensions of the dataset type itself, in order to find the 

1399 dataset with the appropriate validity range. For example, a "bias" 

1400 dataset with native dimensions ``{instrument, detector}`` could be 

1401 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1402 ``exposure`` is a temporal dimension. 
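
Examples
--------
A minimal sketch; the repository path, collection, dataset type and data
ID values below are illustrative assumptions::

    # Hypothetical repository, collection and data ID.
    butler = Butler("/path/to/repo", collections=["example/run"])
    exposure = butler.get(
        "calexp", instrument="ExampleCam", visit=123, detector=4
    )
    # The data ID can also be supplied as a mapping.
    exposure = butler.get(
        "calexp",
        dataId={"instrument": "ExampleCam", "visit": 123, "detector": 4},
    )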

1403 """ 

1404 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1405 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1406 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1407 

1408 def getURIs( 

1409 self, 

1410 datasetRefOrType: DatasetRef | DatasetType | str, 

1411 /, 

1412 dataId: DataId | None = None, 

1413 *, 

1414 predict: bool = False, 

1415 collections: Any = None, 

1416 run: str | None = None, 

1417 **kwargs: Any, 

1418 ) -> DatasetRefURIs: 

1419 """Return the URIs associated with the dataset. 

1420 

1421 Parameters 

1422 ---------- 

1423 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1424 When `DatasetRef` the `dataId` should be `None`. 

1425 Otherwise the `DatasetType` or name thereof. 

1426 dataId : `dict` or `DataCoordinate` 

1427 A `dict` of `Dimension` link name, value pairs that label the 

1428 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1429 should be provided as the first argument. 

1430 predict : `bool` 

1431 If `True`, allow URIs to be returned of datasets that have not 

1432 been written. 

1433 collections : Any, optional 

1434 Collections to be searched, overriding ``self.collections``. 

1435 Can be any of the types supported by the ``collections`` argument 

1436 to butler construction. 

1437 run : `str`, optional 

1438 Run to use for predictions, overriding ``self.run``. 

1439 **kwargs 

1440 Additional keyword arguments used to augment or construct a 

1441 `DataCoordinate`. See `DataCoordinate.standardize` 

1442 parameters. 

1443 

1444 Returns 

1445 ------- 

1446 uris : `DatasetRefURIs` 

1447 The URI to the primary artifact associated with this dataset (if 

1448 the dataset was disassembled within the datastore this may be 

1449 `None`), and the URIs to any components associated with the dataset 

1450 artifact (can be empty if there are no components). 

1451 """ 

1452 ref = self._findDatasetRef( 

1453 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1454 ) 

1455 return self._datastore.getURIs(ref, predict) 

1456 

1457 def getURI( 

1458 self, 

1459 datasetRefOrType: DatasetRef | DatasetType | str, 

1460 /, 

1461 dataId: DataId | None = None, 

1462 *, 

1463 predict: bool = False, 

1464 collections: Any = None, 

1465 run: str | None = None, 

1466 **kwargs: Any, 

1467 ) -> ResourcePath: 

1468 """Return the URI to the Dataset. 

1469 

1470 Parameters 

1471 ---------- 

1472 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1473 When `DatasetRef` the `dataId` should be `None`. 

1474 Otherwise the `DatasetType` or name thereof. 

1475 dataId : `dict` or `DataCoordinate` 

1476 A `dict` of `Dimension` link name, value pairs that label the 

1477 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1478 should be provided as the first argument. 

1479 predict : `bool` 

1480 If `True`, allow URIs to be returned of datasets that have not 

1481 been written. 

1482 collections : Any, optional 

1483 Collections to be searched, overriding ``self.collections``. 

1484 Can be any of the types supported by the ``collections`` argument 

1485 to butler construction. 

1486 run : `str`, optional 

1487 Run to use for predictions, overriding ``self.run``. 

1488 **kwargs 

1489 Additional keyword arguments used to augment or construct a 

1490 `DataCoordinate`. See `DataCoordinate.standardize` 

1491 parameters. 

1492 

1493 Returns 

1494 ------- 

1495 uri : `lsst.resources.ResourcePath` 

1496 URI pointing to the Dataset within the datastore. If the 

1497 Dataset does not exist in the datastore, and if ``predict`` is 

1498 `True`, the URI will be a prediction and will include a URI 

1499 fragment "#predicted". 

1500 If the datastore does not have entities that relate well 

1501 to the concept of a URI, the returned URI string will be 

1502 descriptive. The returned URI is not guaranteed to be obtainable. 

1503 

1504 Raises 

1505 ------ 

1506 LookupError 

1507 Raised if a URI has been requested for a dataset that does not 

1508 exist and guessing is not allowed. 

1509 ValueError 

1510 Raised if a resolved `DatasetRef` was passed as an input, but it 

1511 differs from the one found in the registry. 

1512 TypeError 

1513 Raised if no collections were provided. 

1514 RuntimeError 

1515 Raised if a URI is requested for a dataset that consists of 

1516 multiple artifacts. 
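
Examples
--------
A minimal sketch; the repository path, collection, run and data ID values
below are illustrative assumptions::

    # Hypothetical repository, collection and data ID.
    butler = Butler("/path/to/repo", collections=["example/run"])
    uri = butler.getURI(
        "calexp", instrument="ExampleCam", visit=123, detector=4
    )
    # A predicted URI for a dataset that has not been written yet.
    predicted = butler.getURI(
        "calexp", instrument="ExampleCam", visit=123, detector=4,
        predict=True, run="example/run",
    )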

1517 """ 

1518 primary, components = self.getURIs( 

1519 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1520 ) 

1521 

1522 if primary is None or components: 

1523 raise RuntimeError( 

1524 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1525 "Use Butler.getURIs() instead." 

1526 ) 

1527 return primary 

1528 

1529 def retrieveArtifacts( 

1530 self, 

1531 refs: Iterable[DatasetRef], 

1532 destination: ResourcePathExpression, 

1533 transfer: str = "auto", 

1534 preserve_path: bool = True, 

1535 overwrite: bool = False, 

1536 ) -> list[ResourcePath]: 

1537 """Retrieve the artifacts associated with the supplied refs. 

1538 

1539 Parameters 

1540 ---------- 

1541 refs : iterable of `DatasetRef` 

1542 The datasets for which artifacts are to be retrieved. 

1543 A single ref can result in multiple artifacts. The refs must 

1544 be resolved. 

1545 destination : `lsst.resources.ResourcePath` or `str` 

1546 Location to write the artifacts. 

1547 transfer : `str`, optional 

1548 Method to use to transfer the artifacts. Must be one of the options 

1549 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1550 "move" is not allowed. 

1551 preserve_path : `bool`, optional 

1552 If `True` the full path of the artifact within the datastore 

1553 is preserved. If `False` the final file component of the path 

1554 is used. 

1555 overwrite : `bool`, optional 

1556 If `True` allow transfers to overwrite existing files at the 

1557 destination. 

1558 

1559 Returns 

1560 ------- 

1561 targets : `list` of `lsst.resources.ResourcePath` 

1562 URIs of file artifacts in the destination location. Order is not 

1563 preserved. 

1564 

1565 Notes 

1566 ----- 

1567 For non-file datastores the artifacts written to the destination 

1568 may not match the representation inside the datastore. For example 

1569 a hierarchical data structure in a NoSQL database may well be stored 

1570 as a JSON file. 
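
Examples
--------
A minimal sketch; the dataset type, collection and destination directory
below are illustrative assumptions::

    # Hypothetical query and destination directory.
    refs = butler.registry.queryDatasets(
        "calexp", collections=["example/run"]
    )
    paths = butler.retrieveArtifacts(
        refs, destination="/tmp/artifacts", transfer="copy"
    )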

1571 """ 

1572 return self._datastore.retrieveArtifacts( 

1573 refs, 

1574 ResourcePath(destination), 

1575 transfer=transfer, 

1576 preserve_path=preserve_path, 

1577 overwrite=overwrite, 

1578 ) 

1579 

1580 def exists( 

1581 self, 

1582 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1583 /, 

1584 data_id: DataId | None = None, 

1585 *, 

1586 full_check: bool = True, 

1587 collections: Any = None, 

1588 **kwargs: Any, 

1589 ) -> DatasetExistence: 

1590 """Indicate whether a dataset is known to Butler registry and 

1591 datastore. 

1592 

1593 Parameters 

1594 ---------- 

1595 dataset_ref_or_type : `DatasetRef`, `DatasetType`, or `str` 

1596 When `DatasetRef` the ``data_id`` should be `None`. 

1597 Otherwise the `DatasetType` or name thereof. 

1598 data_id : `dict` or `DataCoordinate` 

1599 A `dict` of `Dimension` link name, value pairs that label the 

1600 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1601 should be provided as the first argument. 

1602 full_check : `bool`, optional 

1603 If `True`, an additional check will be made for dataset artifact 

1604 existence. This will involve additional overhead due to the need 

1605 to query an external system. If `False` registry and datastore 

1606 will solely be asked if they know about the dataset but no 

1607 check for the artifact will be performed. 

1608 collections : Any, optional 

1609 Collections to be searched, overriding ``self.collections``. 

1610 Can be any of the types supported by the ``collections`` argument 

1611 to butler construction. 

1612 **kwargs 

1613 Additional keyword arguments used to augment or construct a 

1614 `DataCoordinate`. See `DataCoordinate.standardize` 

1615 parameters. 

1616 

1617 Returns 

1618 ------- 

1619 existence : `DatasetExistence` 

1620 Object indicating whether the dataset is known to registry and 

1621 datastore. Evaluates to `True` if the dataset is present and known 

1622 to both. 
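
Examples
--------
A minimal sketch; the dataset type and data ID values below are
illustrative assumptions::

    # Hypothetical data ID; the result is a `DatasetExistence` flag.
    existence = butler.exists(
        "calexp", instrument="ExampleCam", visit=123, detector=4
    )
    if existence:
        ...  # dataset is known to registry and datastore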

1623 """ 

1624 existence = DatasetExistence.UNRECOGNIZED 

1625 

1626 if isinstance(dataset_ref_or_type, DatasetRef): 

1627 if collections is not None: 

1628 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1629 if data_id is not None: 

1630 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1631 ref = dataset_ref_or_type 

1632 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1633 if registry_ref is not None: 

1634 existence |= DatasetExistence.RECORDED 

1635 

1636 if dataset_ref_or_type != registry_ref: 

1637 # This could mean that storage classes differ, so we should 

1638 # check for that but use the registry ref for the rest of 

1639 # the method. 

1640 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1641 # Use the registry version from now on. 

1642 ref = registry_ref 

1643 else: 

1644 raise ValueError( 

1645 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1646 f"in registry but has different incompatible values ({registry_ref})." 

1647 ) 

1648 else: 

1649 try: 

1650 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1651 except (LookupError, TypeError, NoDefaultCollectionError): 

1652 return existence 

1653 existence |= DatasetExistence.RECORDED 

1654 

1655 if self._datastore.knows(ref): 

1656 existence |= DatasetExistence.DATASTORE 

1657 

1658 if full_check: 

1659 if self._datastore.exists(ref): 

1660 existence |= DatasetExistence._ARTIFACT 

1661 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1662 # Do not add this flag if we have no other idea about a dataset. 

1663 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1664 

1665 return existence 

1666 

1667 def _exists_many( 

1668 self, 

1669 refs: Iterable[DatasetRef], 

1670 /, 

1671 *, 

1672 full_check: bool = True, 

1673 ) -> dict[DatasetRef, DatasetExistence]: 

1674 """Indicate whether multiple datasets are known to Butler registry and 

1675 datastore. 

1676 

1677 This is an experimental API that may change at any moment. 

1678 

1679 Parameters 

1680 ---------- 

1681 refs : iterable of `DatasetRef` 

1682 The datasets to be checked. 

1683 full_check : `bool`, optional 

1684 If `True`, an additional check will be made for dataset artifact 

1685 existence. This will involve additional overhead due to the need 

1686 to query an external system. If `False` registry and datastore 

1687 will solely be asked if they know about the dataset but no 

1688 check for the artifact will be performed. 

1689 

1690 Returns 

1691 ------- 

1692 existence : dict of [`DatasetRef`, `DatasetExistence`] 

1693 Mapping from the given dataset refs to an enum indicating the 

1694 status of the dataset in registry and datastore. 

1695 Each value evaluates to `True` if the dataset is present and known 

1696 to both. 
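
Examples
--------
A minimal sketch; the dataset type and collection below are illustrative
assumptions::

    # Hypothetical query; full_check=False skips the artifact check.
    refs = list(
        butler.registry.queryDatasets("calexp", collections=["example/run"])
    )
    existence = butler._exists_many(refs, full_check=False)
    missing = [ref for ref, state in existence.items() if not state]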

1697 """ 

1698 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1699 

1700 # Registry does not have a bulk API to check for a ref. 

1701 for ref in refs: 

1702 registry_ref = self._registry.getDataset(ref.id) 

1703 if registry_ref is not None: 

1704 # It is possible, albeit unlikely, that the given ref does 

1705 # not match the one in registry even though the UUID matches. 

1706 # When checking a single ref we raise, but it's impolite to 

1707 # do that when potentially hundreds of refs are being checked. 

1708 # We could change the API to only accept UUIDs and that would 

1709 # remove the ability to even check and remove the worry 

1710 # about differing storage classes. Given the ongoing discussion 

1711 # on refs vs UUIDs and whether to raise or have a new 

1712 # private flag, treat this as a private API for now. 

1713 existence[ref] |= DatasetExistence.RECORDED 

1714 

1715 # Ask datastore if it knows about these refs. 

1716 knows = self._datastore.knows_these(refs) 

1717 for ref, known in knows.items(): 

1718 if known: 

1719 existence[ref] |= DatasetExistence.DATASTORE 

1720 

1721 if full_check: 

1722 mexists = self._datastore.mexists(refs) 

1723 for ref, exists in mexists.items(): 

1724 if exists: 

1725 existence[ref] |= DatasetExistence._ARTIFACT 

1726 else: 

1727 # Do not set this flag if nothing is known about the dataset. 

1728 for ref in existence.keys(): 

1729 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1730 existence[ref] |= DatasetExistence._ASSUMED 

1731 

1732 return existence 

1733 

1734 @deprecated( 

1735 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v27.0.", 

1736 version="v26.0", 

1737 category=FutureWarning, 

1738 ) 

1739 def datasetExists( 

1740 self, 

1741 datasetRefOrType: DatasetRef | DatasetType | str, 

1742 dataId: DataId | None = None, 

1743 *, 

1744 collections: Any = None, 

1745 **kwargs: Any, 

1746 ) -> bool: 

1747 """Return True if the Dataset is actually present in the Datastore. 

1748 

1749 Parameters 

1750 ---------- 

1751 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1752 When `DatasetRef` the `dataId` should be `None`. 

1753 Otherwise the `DatasetType` or name thereof. 

1754 dataId : `dict` or `DataCoordinate` 

1755 A `dict` of `Dimension` link name, value pairs that label the 

1756 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1757 should be provided as the first argument. 

1758 collections : Any, optional 

1759 Collections to be searched, overriding ``self.collections``. 

1760 Can be any of the types supported by the ``collections`` argument 

1761 to butler construction. 

1762 **kwargs 

1763 Additional keyword arguments used to augment or construct a 

1764 `DataCoordinate`. See `DataCoordinate.standardize` 

1765 parameters. 

1766 

1767 Raises 

1768 ------ 

1769 LookupError 

1770 Raised if the dataset is not even present in the Registry. 

1771 ValueError 

1772 Raised if a resolved `DatasetRef` was passed as an input, but it 

1773 differs from the one found in the registry. 

1774 NoDefaultCollectionError 

1775 Raised if no collections were provided. 

1776 """ 

1777 # A resolved ref may be given that is not known to this butler. 

1778 if isinstance(datasetRefOrType, DatasetRef): 

1779 ref = self._registry.getDataset(datasetRefOrType.id) 

1780 if ref is None: 

1781 raise LookupError( 

1782 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1783 ) 

1784 else: 

1785 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1786 return self._datastore.exists(ref) 

1787 

1788 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1789 """Remove one or more `~CollectionType.RUN` collections and the 

1790 datasets within them. 

1791 

1792 Parameters 

1793 ---------- 

1794 names : `~collections.abc.Iterable` [ `str` ] 

1795 The names of the collections to remove. 

1796 unstore : `bool`, optional 

1797 If `True` (default), delete datasets from all datastores in which 

1798 they are present, and attempt to roll back the registry deletions if 

1799 datastore deletions fail (which may not always be possible). If 

1800 `False`, datastore records for these datasets are still removed, 

1801 but any artifacts (e.g. files) will not be. 

1802 

1803 Raises 

1804 ------ 

1805 TypeError 

1806 Raised if one or more collections are not of type 

1807 `~CollectionType.RUN`. 
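
Examples
--------
A minimal sketch; the run name below is an illustrative assumption::

    # Hypothetical RUN collection; unstore=True also deletes artifacts.
    butler.removeRuns(["u/example/scratch"], unstore=True)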

1808 """ 

1809 if not self.isWriteable(): 

1810 raise TypeError("Butler is read-only.") 

1811 names = list(names) 

1812 refs: list[DatasetRef] = [] 

1813 for name in names: 

1814 collectionType = self._registry.getCollectionType(name) 

1815 if collectionType is not CollectionType.RUN: 

1816 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1817 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1818 with self._datastore.transaction(): 

1819 with self._registry.transaction(): 

1820 if unstore: 

1821 self._datastore.trash(refs) 

1822 else: 

1823 self._datastore.forget(refs) 

1824 for name in names: 

1825 self._registry.removeCollection(name) 

1826 if unstore: 

1827 # Point of no return for removing artifacts 

1828 self._datastore.emptyTrash() 

1829 

1830 def pruneDatasets( 

1831 self, 

1832 refs: Iterable[DatasetRef], 

1833 *, 

1834 disassociate: bool = True, 

1835 unstore: bool = False, 

1836 tags: Iterable[str] = (), 

1837 purge: bool = False, 

1838 ) -> None: 

1839 # docstring inherited from LimitedButler 

1840 

1841 if not self.isWriteable(): 

1842 raise TypeError("Butler is read-only.") 

1843 if purge: 

1844 if not disassociate: 

1845 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1846 if not unstore: 

1847 raise TypeError("Cannot pass purge=True without unstore=True.") 

1848 elif disassociate: 

1849 tags = tuple(tags) 

1850 if not tags: 

1851 raise TypeError("No tags provided but disassociate=True.") 

1852 for tag in tags: 

1853 collectionType = self._registry.getCollectionType(tag) 

1854 if collectionType is not CollectionType.TAGGED: 

1855 raise TypeError( 

1856 f"Cannot disassociate from collection '{tag}' " 

1857 f"of non-TAGGED type {collectionType.name}." 

1858 ) 

1859 # Transform possibly-single-pass iterable into something we can iterate 

1860 # over multiple times. 

1861 refs = list(refs) 

1862 # Pruning a component of a DatasetRef makes no sense since registry 

1863 # doesn't know about components and datastore might not store 

1864 # components in a separate file 

1865 for ref in refs: 

1866 if ref.datasetType.component(): 

1867 raise ValueError(f"Cannot prune a component of a dataset (ref={ref})") 

1868 # We don't need an unreliable Datastore transaction for this, because 

1869 # we've been extra careful to ensure that Datastore.trash only involves 

1870 # mutating the Registry (it can _look_ at Datastore-specific things, 

1871 # but shouldn't change them), and hence all operations here are 

1872 # Registry operations. 

1873 with self._datastore.transaction(): 

1874 with self._registry.transaction(): 

1875 if unstore: 

1876 self._datastore.trash(refs) 

1877 if purge: 

1878 self._registry.removeDatasets(refs) 

1879 elif disassociate: 

1880 assert tags, "Guaranteed by earlier logic in this function." 

1881 for tag in tags: 

1882 self._registry.disassociate(tag, refs) 

1883 # We've exited the Registry transaction, and apparently committed. 

1884 # (if there was an exception, everything rolled back, and it's as if 

1885 # nothing happened - and we never get here). 

1886 # Datastore artifacts are not yet gone, but they're clearly marked 

1887 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1888 # problems we can try again later, and if manual administrative 

1889 # intervention is required, it's pretty clear what that should entail: 

1890 # deleting everything on disk and in private Datastore tables that is 

1891 # in the dataset_location_trash table. 

1892 if unstore: 

1893 # Point of no return for removing artifacts 

1894 self._datastore.emptyTrash() 

1895 

1896 @transactional 

1897 def ingest( 

1898 self, 

1899 *datasets: FileDataset, 

1900 transfer: str | None = "auto", 

1901 run: str | None = None, 

1902 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1903 record_validation_info: bool = True, 

1904 ) -> None: 

1905 """Store and register one or more datasets that already exist on disk. 

1906 

1907 Parameters 

1908 ---------- 

1909 datasets : `FileDataset` 

1910 Each positional argument is a struct containing information about 

1911 a file to be ingested, including its URI (either absolute or 

1912 relative to the datastore root, if applicable), a resolved 

1913 `DatasetRef`, and optionally a formatter class or its 

1914 fully-qualified string name. If a formatter is not provided, the 

1915 formatter that would be used for `put` is assumed. On successful 

1916 ingest all `FileDataset.formatter` attributes will be set to the 

1917 formatter class used. `FileDataset.path` attributes may be modified 

1918 to put paths in whatever the datastore considers a standardized 

1919 form. 

1920 transfer : `str`, optional 

1921 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1922 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1923 transfer the file. 

1924 run : `str`, optional 

1925 The name of the run ingested datasets should be added to, 

1926 overriding ``self.run``. This parameter is now deprecated since 

1927 the run is encoded in the ``FileDataset``. 

1928 idGenerationMode : `DatasetIdGenEnum`, optional 

1929 Specifies option for generating dataset IDs. By default unique IDs 

1930 are generated for each inserted dataset. 

1931 record_validation_info : `bool`, optional 

1932 If `True`, the default, the datastore can record validation 

1933 information associated with the file. If `False` the datastore 

1934 will not attempt to track any information such as checksums 

1935 or file sizes. This can be useful if such information is tracked 

1936 in an external system or if the file is to be compressed in place. 

1937 It is up to the datastore whether this parameter is relevant. 

1938 

1939 Raises 

1940 ------ 

1941 TypeError 

1942 Raised if the butler is read-only or if no run was provided. 

1943 NotImplementedError 

1944 Raised if the `Datastore` does not support the given transfer mode. 

1945 DatasetTypeNotSupportedError 

1946 Raised if one or more files to be ingested have a dataset type that 

1947 is not supported by the `Datastore`. 

1948 FileNotFoundError 

1949 Raised if one of the given files does not exist. 

1950 FileExistsError 

1951 Raised if transfer is not `None` but the (internal) location the 

1952 file would be moved to is already occupied. 

1953 

1954 Notes 

1955 ----- 

1956 This operation is not fully exception safe: if a database operation 

1957 fails, the given `FileDataset` instances may be only partially updated. 

1958 

1959 It is atomic in terms of database operations (they will either all 

1960 succeed or all fail), provided the database engine implements 

1961 transactions correctly. It will attempt to be atomic in terms of 

1962 filesystem operations as well, but this cannot be implemented 

1963 rigorously for most datastores. 
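
Examples
--------
A minimal sketch; the dataset type, data ID, run and file path below are
illustrative assumptions::

    # Hypothetical dataset type, data ID, run and file.
    dataset_type = butler.registry.getDatasetType("raw")
    data_id = butler.registry.expandDataId(
        instrument="ExampleCam", exposure=123, detector=4
    )
    ref = DatasetRef(dataset_type, data_id, run="ExampleCam/raw/all")
    butler.ingest(
        FileDataset(path="/data/raw_123_4.fits", refs=[ref]),
        transfer="copy",
    )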

1964 """ 

1965 if not self.isWriteable(): 

1966 raise TypeError("Butler is read-only.") 

1967 

1968 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1969 if not datasets: 

1970 return 

1971 

1972 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1973 

1974 # We need to reorganize all the inputs so that they are grouped 

1975 # by dataset type and run. Multiple refs in a single FileDataset 

1976 # are required to share the run and dataset type. 

1977 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1978 groupedData: GroupedData = defaultdict(list) 

1979 

1980 # Track DataIDs that are being ingested so we can spot issues early 

1981 # with duplication. Retain previous FileDataset so we can report it. 

1982 groupedDataIds: MutableMapping[ 

1983 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1984 ] = defaultdict(dict) 

1985 

1986 used_run = False 

1987 

1988 # And the nested loop that populates it: 

1989 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1990 # Somewhere to store pre-existing refs if we have an 

1991 # execution butler. 

1992 existingRefs: list[DatasetRef] = [] 

1993 

1994 for ref in dataset.refs: 

1995 assert ref.run is not None # For mypy 

1996 group_key = (ref.datasetType, ref.run) 

1997 

1998 if ref.dataId in groupedDataIds[group_key]: 

1999 raise ConflictingDefinitionError( 

2000 f"Ingest conflict. Dataset {dataset.path} has same" 

2001 " DataId as other ingest dataset" 

2002 f" {groupedDataIds[group_key][ref.dataId].path} " 

2003 f" ({ref.dataId})" 

2004 ) 

2005 

2006 groupedDataIds[group_key][ref.dataId] = dataset 

2007 

2008 if existingRefs: 

2009 if len(dataset.refs) != len(existingRefs): 

2010 # Keeping track of partially pre-existing datasets is hard 

2011 # and should generally never happen. For now don't allow 

2012 # it. 

2013 raise ConflictingDefinitionError( 

2014 f"For dataset {dataset.path} some dataIds already exist" 

2015 " in registry but others do not. This is not supported." 

2016 ) 

2017 

2018 # Store expanded form in the original FileDataset. 

2019 dataset.refs = existingRefs 

2020 else: 

2021 groupedData[group_key].append(dataset) 

2022 

2023 if not used_run and run is not None: 

2024 warnings.warn( 

2025 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

2026 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

2027 category=FutureWarning, 

2028 stacklevel=3, # Take into account the @transactional decorator. 

2029 ) 

2030 

2031 # Now we can bulk-insert into Registry for each DatasetType. 

2032 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2033 groupedData.items(), desc="Bulk-inserting datasets by type" 

2034 ): 

2035 refs_to_import = [] 

2036 for dataset in grouped_datasets: 

2037 refs_to_import.extend(dataset.refs) 

2038 

2039 n_refs = len(refs_to_import) 

2040 log.verbose( 

2041 "Importing %d ref%s of dataset type %r into run %r", 

2042 n_refs, 

2043 "" if n_refs == 1 else "s", 

2044 datasetType.name, 

2045 this_run, 

2046 ) 

2047 

2048 # Import the refs and expand the DataCoordinates since we can't 

2049 # guarantee that they are expanded and Datastore will need 

2050 # the records. 

2051 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

2052 assert set(imported_refs) == set(refs_to_import) 

2053 

2054 # Replace all the refs in the FileDataset with expanded versions. 

2055 # Pull them off in the order we put them on the list. 

2056 for dataset in grouped_datasets: 

2057 n_dataset_refs = len(dataset.refs) 

2058 dataset.refs = imported_refs[:n_dataset_refs] 

2059 del imported_refs[:n_dataset_refs] 

2060 

2061 # Bulk-insert everything into Datastore. 

2062 # We do not know if any of the registry entries already existed 

2063 # (_importDatasets only complains if they exist but differ) so 

2064 # we have to catch IntegrityError explicitly. 

2065 try: 

2066 self._datastore.ingest( 

2067 *datasets, transfer=transfer, record_validation_info=record_validation_info 

2068 ) 

2069 except IntegrityError as e: 

2070 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") 

2071 

2072 @contextlib.contextmanager 

2073 def export( 

2074 self, 

2075 *, 

2076 directory: str | None = None, 

2077 filename: str | None = None, 

2078 format: str | None = None, 

2079 transfer: str | None = None, 

2080 ) -> Iterator[RepoExportContext]: 

2081 """Export datasets from the repository represented by this `Butler`. 

2082 

2083 This method is a context manager that returns a helper object 

2084 (`RepoExportContext`) that is used to indicate what information from 

2085 the repository should be exported. 

2086 

2087 Parameters 

2088 ---------- 

2089 directory : `str`, optional 

2090 Directory dataset files should be written to if ``transfer`` is not 

2091 `None`. 

2092 filename : `str`, optional 

2093 Name for the file that will include database information associated 

2094 with the exported datasets. If this is not an absolute path and 

2095 ``directory`` is not `None`, it will be written to ``directory`` 

2096 instead of the current working directory. Defaults to 

2097 "export.{format}". 

2098 format : `str`, optional 

2099 File format for the database information file. If `None`, the 

2100 extension of ``filename`` will be used. 

2101 transfer : `str`, optional 

2102 Transfer mode passed to `Datastore.export`. 

2103 

2104 Raises 

2105 ------ 

2106 TypeError 

2107 Raised if the set of arguments passed is inconsistent. 

2108 

2109 Examples 

2110 -------- 

2111 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2112 methods are used to provide the iterables over data IDs and/or datasets 

2113 to be exported:: 

2114 

2115 with butler.export(filename="exports.yaml") as export: 

2116 # Export all flats, but none of the dimension element rows 

2117 # (i.e. data ID information) associated with them. 

2118 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2119 elements=()) 

2120 # Export all datasets that start with "deepCoadd_" and all of 

2121 # their associated data ID information. 

2122 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2123 """ 

2124 if directory is None and transfer is not None: 

2125 raise TypeError("Cannot transfer without providing a directory.") 

2126 if transfer == "move": 

2127 raise TypeError("Transfer may not be 'move': export is read-only") 

2128 if format is None: 

2129 if filename is None: 

2130 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2131 else: 

2132 _, format = os.path.splitext(filename) 

2133 if not format: 

2134 raise ValueError("Please specify a file extension to determine export format.") 

2135 format = format[1:] # Strip leading "." 

2136 elif filename is None: 

2137 filename = f"export.{format}" 

2138 if directory is not None: 

2139 filename = os.path.join(directory, filename) 

2140 formats = self._config["repo_transfer_formats"] 

2141 if format not in formats: 

2142 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2143 BackendClass = get_class_of(formats[format, "export"]) 

2144 with open(filename, "w") as stream: 

2145 backend = BackendClass(stream, universe=self.dimensions) 

2146 try: 

2147 helper = RepoExportContext( 

2148 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

2149 ) 

2150 yield helper 

2151 except BaseException: 

2152 raise 

2153 else: 

2154 helper._finish() 

2155 

2156 def import_( 

2157 self, 

2158 *, 

2159 directory: ResourcePathExpression | None = None, 

2160 filename: ResourcePathExpression | TextIO | None = None, 

2161 format: str | None = None, 

2162 transfer: str | None = None, 

2163 skip_dimensions: set | None = None, 

2164 ) -> None: 

2165 """Import datasets into this repository that were exported from a 

2166 different butler repository via `~lsst.daf.butler.Butler.export`. 

2167 

2168 Parameters 

2169 ---------- 

2170 directory : `~lsst.resources.ResourcePathExpression`, optional 

2171 Directory containing dataset files to import from. If `None`, 

2172 ``filename`` and all dataset file paths specified therein must 

2173 be absolute. 

2174 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2175 A stream or name of file that contains database information 

2176 associated with the exported datasets, typically generated by 

2177 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2178 `~lsst.resources.ResourcePath` and is not an absolute path, 

2179 it will first be looked for relative to ``directory`` and if not 

2180 found there it will be looked for in the current working 

2181 directory. Defaults to "export.{format}". 

2182 format : `str`, optional 

2183 File format for ``filename``. If `None`, the extension of 

2184 ``filename`` will be used. 

2185 transfer : `str`, optional 

2186 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2187 skip_dimensions : `set`, optional 

2188 Names of dimensions that should be skipped and not imported. 

2189 

2190 Raises 

2191 ------ 

2192 TypeError 

2193 Raised if the set of arguments passed is inconsistent, or if the 

2194 butler is read-only. 
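
Examples
--------
A minimal sketch; the directory and file names below are illustrative
assumptions::

    # Hypothetical export produced earlier by Butler.export().
    butler.import_(
        directory="/path/to/exported",
        filename="export.yaml",
        transfer="symlink",
    )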

2195 """ 

2196 if not self.isWriteable(): 

2197 raise TypeError("Butler is read-only.") 

2198 if format is None: 

2199 if filename is None: 

2200 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2201 else: 

2202 _, format = os.path.splitext(filename) # type: ignore 

2203 elif filename is None: 

2204 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2205 if directory is not None: 

2206 directory = ResourcePath(directory, forceDirectory=True) 

2207 # mypy doesn't think this will work but it does in python >= 3.10. 

2208 if isinstance(filename, ResourcePathExpression): # type: ignore 

2209 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2210 if not filename.isabs() and directory is not None: 

2211 potential = directory.join(filename) 

2212 exists_in_cwd = filename.exists() 

2213 exists_in_dir = potential.exists() 

2214 if exists_in_cwd and exists_in_dir: 

2215 log.warning( 

2216 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2217 "Additionally, the file exists relative to the given search directory (%s). " 

2218 "Using the export file in the given directory.", 

2219 filename, 

2220 potential, 

2221 ) 

2222 # Given they specified an explicit directory and that 

2223 # directory has the export file in it, assume that that 

2224 # is what was meant despite the file in cwd. 

2225 filename = potential 

2226 elif exists_in_dir: 

2227 filename = potential 

2228 elif not exists_in_cwd and not exists_in_dir: 

2229 # Raise early. 

2230 raise FileNotFoundError( 

2231 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2232 ) 

2233 BackendClass: type[RepoImportBackend] = get_class_of( 

2234 self._config["repo_transfer_formats"][format]["import"] 

2235 ) 

2236 

2237 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2238 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

2239 backend.register() 

2240 with self.transaction(): 

2241 backend.load( 

2242 self._datastore, 

2243 directory=directory, 

2244 transfer=transfer, 

2245 skip_dimensions=skip_dimensions, 

2246 ) 

2247 

2248 if isinstance(filename, ResourcePath): 

2249 # We cannot use open() here at the moment because of 

2250 # DM-38589 since yaml does stream.read(8192) in a loop. 

2251 stream = io.StringIO(filename.read().decode()) 

2252 doImport(stream) 

2253 else: 

2254 doImport(filename) # type: ignore 

2255 

2256 def transfer_from( 

2257 self, 

2258 source_butler: LimitedButler, 

2259 source_refs: Iterable[DatasetRef], 

2260 transfer: str = "auto", 

2261 skip_missing: bool = True, 

2262 register_dataset_types: bool = False, 

2263 transfer_dimensions: bool = False, 

2264 ) -> collections.abc.Collection[DatasetRef]: 

2265 """Transfer datasets to this Butler from a run in another Butler. 

2266 

2267 Parameters 

2268 ---------- 

2269 source_butler : `LimitedButler` 

2270 Butler from which the datasets are to be transferred. If data IDs 

2271 in ``source_refs`` are not expanded then this has to be a full 

2272 `Butler` whose registry will be used to expand data IDs. 

2273 source_refs : iterable of `DatasetRef` 

2274 Datasets defined in the source butler that should be transferred to 

2275 this butler. 

2276 transfer : `str`, optional 

2277 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2278 skip_missing : `bool` 

2279 If `True`, datasets with no datastore artifact associated with 

2280 them are not transferred. If `False` a registry entry will be 

2281 created even if no datastore record is created (and so will 

2282 look equivalent to the dataset being unstored). 

2283 register_dataset_types : `bool` 

2284 If `True` any missing dataset types are registered. Otherwise 

2285 an exception is raised. 

2286 transfer_dimensions : `bool`, optional 

2287 If `True`, dimension record data associated with the new datasets 

2288 will be transferred. 

2289 

2290 Returns 

2291 ------- 

2292 refs : `list` of `DatasetRef` 

2293 The refs added to this Butler. 

2294 

2295 Notes 

2296 ----- 

2297 The datastore artifact has to exist for a transfer 

2298 to be made but non-existence is not an error. 

2299 

2300 Datasets that already exist in this run will be skipped. 

2301 

2302 The datasets are imported as part of a transaction, although 

2303 dataset types are registered before the transaction is started. 

2304 This means that it is possible for a dataset type to be registered 

2305 even though transfer has failed. 
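
Examples
--------
A minimal sketch; the repository paths, dataset type and collection below
are illustrative assumptions::

    # Hypothetical source and (writeable) target repositories.
    source = Butler("/path/to/source-repo")
    target = Butler("/path/to/target-repo", writeable=True)
    refs = source.registry.queryDatasets(
        "calexp", collections=["example/run"]
    )
    transferred = target.transfer_from(
        source,
        refs,
        transfer="copy",
        register_dataset_types=True,
        transfer_dimensions=True,
    )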

2306 """ 

2307 if not self.isWriteable(): 

2308 raise TypeError("Butler is read-only.") 

2309 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2310 

2311 # Will iterate through the refs multiple times so need to convert 

2312 # to a list if this isn't a collection. 

2313 if not isinstance(source_refs, collections.abc.Collection): 

2314 source_refs = list(source_refs) 

2315 

2316 original_count = len(source_refs) 

2317 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2318 

2319 # In some situations the datastore artifact may be missing 

2320 # and we do not want that registry entry to be imported. 

2321 # Asking datastore is not sufficient, the records may have been 

2322 # purged, we have to ask for the (predicted) URI and check 

2323 # existence explicitly. Execution butler is set up exactly like 

2324 # this with no datastore records. 

2325 artifact_existence: dict[ResourcePath, bool] = {} 

2326 if skip_missing: 

2327 dataset_existence = source_butler._datastore.mexists( 

2328 source_refs, artifact_existence=artifact_existence 

2329 ) 

2330 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2331 filtered_count = len(source_refs) 

2332 n_missing = original_count - filtered_count 

2333 log.verbose( 

2334 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2335 n_missing, 

2336 "" if n_missing == 1 else "s", 

2337 filtered_count, 

2338 ) 

2339 

2340 # Importing requires that we group the refs by dataset type and run 

2341 # before doing the import. 

2342 source_dataset_types = set() 

2343 grouped_refs = defaultdict(list) 

2344 for ref in source_refs: 

2345 grouped_refs[ref.datasetType, ref.run].append(ref) 

2346 source_dataset_types.add(ref.datasetType) 

2347 

2348 # Check to see if the dataset type in the source butler has 

2349 # the same definition in the target butler and register missing 

2350 # ones if requested. Registration must happen outside a transaction. 

2351 newly_registered_dataset_types = set() 

2352 for datasetType in source_dataset_types: 

2353 if register_dataset_types: 

2354 # Let this raise immediately if inconsistent. Continuing 

2355 # on to find additional inconsistent dataset types 

2356 # might result in additional unwanted dataset types being 

2357 # registered. 

2358 if self._registry.registerDatasetType(datasetType): 

2359 newly_registered_dataset_types.add(datasetType) 

2360 else: 

2361 # If the dataset type is missing, let it fail immediately. 

2362 target_dataset_type = self._registry.getDatasetType(datasetType.name) 

2363 if target_dataset_type != datasetType: 

2364 raise ConflictingDefinitionError( 

2365 "Source butler dataset type differs from definition" 

2366 f" in target butler: {datasetType} !=" 

2367 f" {target_dataset_type}" 

2368 ) 

2369 if newly_registered_dataset_types: 

2370 # We may have registered some even if there were inconsistencies 

2371 # but should let people know (or else remove them again). 

2372 log.log( 

2373 VERBOSE, 

2374 "Registered the following dataset types in the target Butler: %s", 

2375 ", ".join(d.name for d in newly_registered_dataset_types), 

2376 ) 

2377 else: 

2378 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2379 

2380 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2381 if transfer_dimensions: 

2382 # Collect all the dimension records for these refs. 

2383 # All dimensions are to be copied but the list of valid dimensions 

2384 # comes from this butler's universe. 

2385 elements = frozenset( 

2386 element 

2387 for element in self.dimensions.getStaticElements() 

2388 if element.hasTable() and element.viewOf is None 

2389 ) 

2390 dataIds = {ref.dataId for ref in source_refs} 

2391 # This logic comes from saveDataIds. 

2392 for dataId in dataIds: 

2393 # Need an expanded record; if it is not expanded we need a full 

2394 # butler with registry (allow mocks with registry too). 

2395 if not dataId.hasRecords(): 

2396 if registry := getattr(source_butler, "registry", None): 

2397 dataId = registry.expandDataId(dataId) 

2398 else: 

2399 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2400 # If this butler doesn't know about a dimension in the source 

2401 # butler things will break later. 

2402 for record in dataId.records.values(): 

2403 if record is not None and record.definition in elements: 

2404 dimension_records[record.definition].setdefault(record.dataId, record) 

2405 

2406 handled_collections: set[str] = set() 

2407 

2408 # Do all the importing in a single transaction. 

2409 with self.transaction(): 

2410 if dimension_records: 

2411 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2412 for element, r in dimension_records.items(): 

2413 records = [r[dataId] for dataId in r] 

2414 # Assume that if the record is already present that we can 

2415 # use it without having to check that the record metadata 

2416 # is consistent. 

2417 self._registry.insertDimensionData(element, *records, skip_existing=True) 

2418 

2419 n_imported = 0 

2420 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2421 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2422 ): 

2423 if run not in handled_collections: 

2424 # May need to create output collection. If source butler 

2425 # has a registry, ask for documentation string. 

2426 run_doc = None 

2427 if registry := getattr(source_butler, "registry", None): 

2428 run_doc = registry.getCollectionDocumentation(run) 

2429 registered = self._registry.registerRun(run, doc=run_doc) 

2430 handled_collections.add(run) 

2431 if registered: 

2432 log.log(VERBOSE, "Creating output run %s", run) 

2433 

2434 n_refs = len(refs_to_import) 

2435 log.verbose( 

2436 "Importing %d ref%s of dataset type %s into run %s", 

2437 n_refs, 

2438 "" if n_refs == 1 else "s", 

2439 datasetType.name, 

2440 run, 

2441 ) 

2442 

2443 # Assume we are using UUIDs and the source refs will match 

2444 # those imported. 

2445 imported_refs = self._registry._importDatasets(refs_to_import, expand=False) 

2446 assert set(imported_refs) == set(refs_to_import) 

2447 n_imported += len(imported_refs) 

2448 

2449 assert len(source_refs) == n_imported 

2450 log.verbose("Imported %d datasets into destination butler", n_imported) 

2451 

2452 # Ask the datastore to transfer. The datastore has to check that 

2453 # the source datastore is compatible with the target datastore. 

2454 accepted, rejected = self._datastore.transfer_from( 

2455 source_butler._datastore, 

2456 source_refs, 

2457 transfer=transfer, 

2458 artifact_existence=artifact_existence, 

2459 ) 

2460 if rejected: 

2461 # For now, accept the registry entries but not the files. 

2462 log.warning( 

2463 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2464 len(rejected), 

2465 len(accepted), 

2466 datasetType, 

2467 run, 

2468 ) 

2469 

2470 return source_refs 

2471 

2472 def validateConfiguration( 

2473 self, 

2474 logFailures: bool = False, 

2475 datasetTypeNames: Iterable[str] | None = None, 

2476 ignore: Iterable[str] | None = None, 

2477 ) -> None: 

2478 """Validate butler configuration. 

2479 

2480 Checks that each `DatasetType` can be stored in the `Datastore`. 

2481 

2482 Parameters 

2483 ---------- 

2484 logFailures : `bool`, optional 

2485 If `True`, output a log message for every validation error 

2486 detected. 

2487 datasetTypeNames : iterable of `str`, optional 

2488 The `DatasetType` names that should be checked. This allows 

2489 only a subset to be selected. 

2490 ignore : iterable of `str`, optional 

2491 Names of DatasetTypes to skip over. This can be used to skip 

2492 known problems. If a named `DatasetType` corresponds to a 

2493 composite, all components of that `DatasetType` will also be 

2494 ignored. 

2495 

2496 Raises 

2497 ------ 

2498 ButlerValidationError 

2499 Raised if there is some inconsistency with how this Butler 

2500 is configured. 
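
Examples
--------
A minimal sketch; the ignored dataset type name below is an illustrative
assumption::

    # Log every failure and skip a known-problematic dataset type.
    butler.validateConfiguration(logFailures=True, ignore=["example_raw"])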

2501 """ 

2502 if datasetTypeNames: 

2503 datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] 

2504 else: 

2505 datasetTypes = list(self._registry.queryDatasetTypes()) 

2506 

2507 # filter out anything from the ignore list 

2508 if ignore: 

2509 ignore = set(ignore) 

2510 datasetTypes = [ 

2511 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2512 ] 

2513 else: 

2514 ignore = set() 

2515 

2516 # For each datasetType that has an instrument dimension, create 

2517 # a DatasetRef for each defined instrument 

2518 datasetRefs = [] 

2519 

2520 # Find all the registered instruments (if "instrument" is in the 

2521 # universe). 

2522 if "instrument" in self.dimensions: 

2523 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2524 

2525 for datasetType in datasetTypes: 

2526 if "instrument" in datasetType.dimensions: 

2527 # In order to create a conforming dataset ref, create 

2528 # fake DataCoordinate values for the non-instrument 

2529 # dimensions. The type of the value does not matter here. 

2530 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} 

2531 

2532 for instrument in instruments: 

2533 datasetRef = DatasetRef( 

2534 datasetType, 

2535 DataCoordinate.standardize( 

2536 dataId, instrument=instrument, graph=datasetType.dimensions 

2537 ), 

2538 run="validate", 

2539 ) 

2540 datasetRefs.append(datasetRef) 

2541 

2542 entities: list[DatasetType | DatasetRef] = [] 

2543 entities.extend(datasetTypes) 

2544 entities.extend(datasetRefs) 

2545 

2546 datastoreErrorStr = None 

2547 try: 

2548 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2549 except ValidationError as e: 

2550 datastoreErrorStr = str(e) 

2551 

2552 # Also check that the LookupKeys used by the datastores match 

2553 # registry and storage class definitions 

2554 keys = self._datastore.getLookupKeys() 

2555 

2556 failedNames = set() 

2557 failedDataId = set() 

2558 for key in keys: 

2559 if key.name is not None: 

2560 if key.name in ignore: 

2561 continue 

2562 

2563 # skip if specific datasetType names were requested and this 

2564 # name does not match 

2565 if datasetTypeNames and key.name not in datasetTypeNames: 

2566 continue 

2567 

2568 # See if it is a StorageClass or a DatasetType 

2569 if key.name in self.storageClasses: 

2570 pass 

2571 else: 

2572 try: 

2573 self._registry.getDatasetType(key.name) 

2574 except KeyError: 

2575 if logFailures: 

2576 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2577 failedNames.add(key) 

2578 else: 

2579 # Dimensions are checked for consistency when the Butler 

2580 # is created and rendezvoused with a universe. 

2581 pass 

2582 

2583 # Check that the instrument is a valid instrument 

2584 # Currently only support instrument so check for that 

2585 if key.dataId: 

2586 dataIdKeys = set(key.dataId) 

2587 if {"instrument"} != dataIdKeys: 

2588 if logFailures: 

2589 log.critical("Key '%s' has unsupported DataId override", key) 

2590 failedDataId.add(key) 

2591 elif key.dataId["instrument"] not in instruments: 

2592 if logFailures: 

2593 log.critical("Key '%s' has unknown instrument", key) 

2594 failedDataId.add(key) 

2595 

2596 messages = [] 

2597 

2598 if datastoreErrorStr: 

2599 messages.append(datastoreErrorStr) 

2600 

2601 for failed, msg in ( 

2602 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2603 (failedDataId, "Keys with bad DataId entries: "), 

2604 ): 

2605 if failed: 

2606 msg += ", ".join(str(k) for k in failed) 

2607 messages.append(msg) 

2608 

2609 if messages: 

2610 raise ValidationError(";\n".join(messages)) 

2611 

2612 @property 

2613 def collections(self) -> Sequence[str]: 

2614 """The collections to search by default, in order 

2615 (`~collections.abc.Sequence` [ `str` ]). 

2616 

2617 This is an alias for ``self.registry.defaults.collections``. It cannot 

2618 be set directly in isolation, but all defaults may be changed together 

2619 by assigning a new `RegistryDefaults` instance to 

2620 ``self.registry.defaults``. 

2621 """ 

2622 return self._registry.defaults.collections 

2623 

2624 @property 

2625 def run(self) -> str | None: 

2626 """Name of the run this butler writes outputs to by default (`str` or 

2627 `None`). 

2628 

2629 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2630 directly in isolation, but all defaults may be changed together by 

2631 assigning a new `RegistryDefaults` instance to 

2632 ``self.registry.defaults``. 

2633 """ 

2634 return self._registry.defaults.run 

2635 

2636 @property 

2637 def registry(self) -> Registry: 

2638 """The object that manages dataset metadata and relationships 

2639 (`Registry`). 

2640 

2641 Many operations that don't involve reading or writing butler datasets 

2642 are accessible only via `Registry` methods. Eventually these methods 

2643 will be replaced by equivalent `Butler` methods. 

2644 """ 

2645 return self._registry_shim 

2646 

2647 @property 

2648 def dimensions(self) -> DimensionUniverse: 

2649 # Docstring inherited. 

2650 return self._registry.dimensions 

2651 

2652 _registry: _ButlerRegistry 

2653 """The object that manages dataset metadata and relationships 

2654 (`_ButlerRegistry`). 

2655 

2656 Most operations that don't involve reading or writing butler datasets are 

2657 accessible only via `Registry` methods. 

2658 """ 

2659 

2660 datastore: Datastore 

2661 """The object that manages actual dataset storage (`Datastore`). 

2662 

2663 Direct user access to the datastore should rarely be necessary; the primary 

2664 exception is the case where a `Datastore` implementation provides extra 

2665 functionality beyond what the base class defines. 

2666 """ 

2667 

2668 storageClasses: StorageClassFactory 

2669 """An object that maps known storage class names to objects that fully 

2670 describe them (`StorageClassFactory`). 

2671 """