Coverage for python/lsst/daf/butler/_butler.py: 8%

737 statements  

coverage.py v7.2.5, created at 2023-05-05 03:17 -0700

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import uuid 

39import warnings 

40from collections import defaultdict 

41from typing import ( 

42 TYPE_CHECKING, 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Sequence, 

53 Set, 

54 TextIO, 

55 Tuple, 

56 Type, 

57 Union, 

58) 

59 

60from deprecated.sphinx import deprecated 

61from lsst.resources import ResourcePath, ResourcePathExpression 

62from lsst.utils import doImportType 

63from lsst.utils.introspection import get_class_of 

64from lsst.utils.logging import VERBOSE, getLogger 

65from sqlalchemy.exc import IntegrityError 

66 

67from ._butlerConfig import ButlerConfig 

68from ._butlerRepoIndex import ButlerRepoIndex 

69from ._deferredDatasetHandle import DeferredDatasetHandle 

70from ._limited_butler import LimitedButler 

71from .core import ( 

72 AmbiguousDatasetError, 

73 Config, 

74 ConfigSubset, 

75 DataCoordinate, 

76 DataId, 

77 DataIdValue, 

78 DatasetIdFactory, 

79 DatasetIdGenEnum, 

80 DatasetRef, 

81 DatasetRefURIs, 

82 DatasetType, 

83 Datastore, 

84 Dimension, 

85 DimensionConfig, 

86 DimensionElement, 

87 DimensionRecord, 

88 DimensionUniverse, 

89 FileDataset, 

90 Progress, 

91 StorageClass, 

92 StorageClassFactory, 

93 Timespan, 

94 UnresolvedRefWarning, 

95 ValidationError, 

96) 

97from .core.repoRelocation import BUTLER_ROOT_TAG 

98from .core.utils import transactional 

99from .registry import ( 

100 CollectionType, 

101 ConflictingDefinitionError, 

102 DataIdError, 

103 MissingDatasetTypeError, 

104 Registry, 

105 RegistryConfig, 

106 RegistryDefaults, 

107) 

108from .transfers import RepoExportContext 

109 

110if TYPE_CHECKING: 

111 from lsst.resources import ResourceHandleProtocol 

112 

113log = getLogger(__name__) 

114 

115 

116class ButlerValidationError(ValidationError): 

117 """There is a problem with the Butler configuration.""" 

118 

119 pass 

120 

121 

122class Butler(LimitedButler): 

123 """Main entry point for the data access system. 

124 

125 Parameters 

126 ---------- 

127 config : `ButlerConfig`, `Config` or `str`, optional

128 Configuration. Anything acceptable to the 

129 `ButlerConfig` constructor. If a directory path 

130 is given the configuration will be read from a ``butler.yaml`` file in 

131 that location. If `None` is given default values will be used. 

132 butler : `Butler`, optional

133 If provided, construct a new Butler that uses the same registry and 

134 datastore as the given one, but with the given collection and run. 

135 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

136 arguments. 

137 collections : `str` or `Iterable` [ `str` ], optional 

138 An expression specifying the collections to be searched (in order) when 

139 reading datasets. 

140 This may be a `str` collection name or an iterable thereof. 

141 See :ref:`daf_butler_collection_expressions` for more information. 

142 These collections are not registered automatically and must be

143 registered manually before any method uses them, though that

144 registration may happen after the `Butler` is initialized.

145 run : `str`, optional 

146 Name of the `~CollectionType.RUN` collection new datasets should be 

147 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

148 ``collections`` will be set to ``[run]``. If not `None`, this 

149 collection will automatically be registered. If this is not set (and 

150 ``writeable`` is not set either), a read-only butler will be created. 

151 searchPaths : `list` of `str`, optional 

152 Directory paths to search when calculating the full Butler 

153 configuration. Not used if the supplied config is already a 

154 `ButlerConfig`. 

155 writeable : `bool`, optional 

156 Explicitly sets whether the butler supports write operations. If not

157 provided, a read-write butler is created if ``run`` is not `None`;

158 otherwise a read-only butler is created.

159 inferDefaults : `bool`, optional 

160 If `True` (default) infer default data ID values from the values 

161 present in the datasets in ``collections``: if all collections have the 

162 same value (or no value) for a governor dimension, that value will be 

163 the default for that dimension. Nonexistent collections are ignored. 

164 If a default value is provided explicitly for a governor dimension via 

165 ``**kwargs``, no default will be inferred for that dimension. 

166 **kwargs : `str` 

167 Default data ID key-value pairs. These may only identify "governor" 

168 dimensions like ``instrument`` and ``skymap``. 

169 

170 Examples 

171 -------- 

172 While there are many ways to control exactly how a `Butler` interacts with 

173 the collections in its `Registry`, the most common cases are still simple. 

174 

175 For a read-only `Butler` that searches one collection, do:: 

176 

177 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

178 

179 For a read-write `Butler` that writes to and reads from a 

180 `~CollectionType.RUN` collection:: 

181 

182 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

183 

184 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

185 because we want to write to one `~CollectionType.RUN` collection but read 

186 from several others (as well):: 

187 

188 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

189 collections=["u/alice/DM-50000/a", 

190 "u/bob/DM-49998", 

191 "HSC/defaults"]) 

192 

193 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

194 Datasets will be read first from that run (since it appears first in the 

195 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

196 

197 Finally, one can always create a `Butler` with no collections:: 

198 

199 butler = Butler("/path/to/repo", writeable=True) 

200 

201 This can be extremely useful when you just want to use ``butler.registry``, 

202 e.g. for inserting dimension data or managing collections, or when the 

203 collections you want to use with the butler are not consistent. 

204 Passing ``writeable`` explicitly here is only necessary if you want to be

205 able to make changes to the repo; usually the value for ``writeable`` can

206 be guessed from the collection arguments provided, but it defaults to

207 `False` when there are no collection arguments.
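
Default data ID values for governor dimensions can also be supplied as
keyword arguments; the collection and instrument names below are purely
illustrative::

    butler = Butler("/path/to/repo", collections=["HSC/defaults"],
                    instrument="HSC")

With such a default in place, data IDs passed to `Butler.get` and
`Butler.put` may omit the ``instrument`` key.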

208 """ 

209 

210 def __init__( 

211 self, 

212 config: Union[Config, str, None] = None, 

213 *, 

214 butler: Optional[Butler] = None, 

215 collections: Any = None, 

216 run: Optional[str] = None, 

217 searchPaths: Optional[List[str]] = None, 

218 writeable: Optional[bool] = None, 

219 inferDefaults: bool = True, 

220 **kwargs: str, 

221 ): 

222 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

223 # Load registry, datastore, etc. from config or existing butler. 

224 if butler is not None: 

225 if config is not None or searchPaths is not None or writeable is not None: 

226 raise TypeError( 

227 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

228 ) 

229 self.registry = butler.registry.copy(defaults) 

230 self.datastore = butler.datastore 

231 self.storageClasses = butler.storageClasses 

232 self._config: ButlerConfig = butler._config 

233 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

234 else: 

235 # Can only look for strings in the known repos list. 

236 if isinstance(config, str) and config in self.get_known_repos(): 

237 config = str(self.get_repo_uri(config)) 

238 try: 

239 self._config = ButlerConfig(config, searchPaths=searchPaths) 

240 except FileNotFoundError as e: 

241 if known := self.get_known_repos(): 

242 aliases = f"(known aliases: {', '.join(known)})" 

243 else: 

244 aliases = "(no known aliases)" 

245 raise FileNotFoundError(f"{e} {aliases}") from e 

247 try: 

248 if "root" in self._config: 

249 butlerRoot = self._config["root"] 

250 else: 

251 butlerRoot = self._config.configDir 

252 if writeable is None: 

253 writeable = run is not None 

254 self.registry = Registry.fromConfig( 

255 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

256 ) 

257 self.datastore = Datastore.fromConfig( 

258 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

259 ) 

260 self.storageClasses = StorageClassFactory() 

261 self.storageClasses.addFromConfig(self._config) 

262 self._allow_put_of_predefined_dataset = self._config.get( 

263 "allow_put_of_predefined_dataset", False 

264 ) 

265 except Exception: 

266 # Failures here usually mean that configuration is incomplete, 

267 # just issue an error message which includes config file URI. 

268 log.error("Failed to instantiate Butler from config %s.", self._config.configFile) 

269 raise 

270 

271 # For execution butler the datastore needs a special 

272 # dependency-inversion trick. This is not used by regular butler, 

273 # but we do not have a way to distinguish regular butler from execution 

274 # butler. 

275 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

276 

277 if "run" in self._config or "collection" in self._config: 

278 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

279 

280 GENERATION: ClassVar[int] = 3 

281 """This is a Generation 3 Butler. 

282 

283 This attribute may be removed in the future, once the Generation 2 Butler 

284 interface has been fully retired; it should only be used in transitional 

285 code. 

286 """ 

287 

288 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

289 """Return DatasetType defined in registry given dataset type name.""" 

290 try: 

291 return self.registry.getDatasetType(name) 

292 except MissingDatasetTypeError: 

293 return None 

294 

295 @classmethod 

296 def get_repo_uri(cls, label: str) -> ResourcePath: 

297 """Look up the label in a butler repository index. 

298 

299 Parameters 

300 ---------- 

301 label : `str` 

302 Label of the Butler repository to look up. 

303 

304 Returns 

305 ------- 

306 uri : `lsst.resources.ResourcePath` 

307 URI to the Butler repository associated with the given label. 

308 

309 Raises 

310 ------ 

311 KeyError 

312 Raised if the label is not found in the index, or if an index 

313 can not be found at all. 

314 

315 Notes 

316 ----- 

317 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

318 information is discovered. 
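
Examples
--------
A sketch of resolving a label, assuming an index that defines a
hypothetical ``main`` alias::

    uri = Butler.get_repo_uri("main")
    butler = Butler(str(uri))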

319 """ 

320 return ButlerRepoIndex.get_repo_uri(label) 

321 

322 @classmethod 

323 def get_known_repos(cls) -> Set[str]: 

324 """Retrieve the list of known repository labels. 

325 

326 Returns 

327 ------- 

328 repos : `set` of `str` 

329 All the known labels. Can be empty if no index can be found. 

330 

331 Notes 

332 ----- 

333 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

334 information is discovered. 

335 """ 

336 return ButlerRepoIndex.get_known_repos() 

337 

338 @staticmethod 

339 def makeRepo( 

340 root: ResourcePathExpression, 

341 config: Union[Config, str, None] = None, 

342 dimensionConfig: Union[Config, str, None] = None, 

343 standalone: bool = False, 

344 searchPaths: Optional[List[str]] = None, 

345 forceConfigRoot: bool = True, 

346 outfile: Optional[ResourcePathExpression] = None, 

347 overwrite: bool = False, 

348 ) -> Config: 

349 """Create an empty data repository by adding a butler.yaml config 

350 to a repository root directory. 

351 

352 Parameters 

353 ---------- 

354 root : `lsst.resources.ResourcePathExpression` 

355 Path or URI to the root location of the new repository. Will be 

356 created if it does not exist. 

357 config : `Config` or `str`, optional 

358 Configuration to write to the repository, after setting any 

359 root-dependent Registry or Datastore config options. Can not 

360 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

361 configuration will be used. Root-dependent config options 

362 specified in this config are overwritten if ``forceConfigRoot`` 

363 is `True`. 

364 dimensionConfig : `Config` or `str`, optional 

365 Configuration for dimensions, will be used to initialize registry 

366 database. 

367 standalone : `bool` 

368 If True, write all expanded defaults, not just customized or 

369 repository-specific settings. 

370 This (mostly) decouples the repository from the default 

371 configuration, insulating it from changes to the defaults (which 

372 may be good or bad, depending on the nature of the changes). 

373 Future *additions* to the defaults will still be picked up when 

374 initializing a `Butler` for repos created with ``standalone=True``. 

375 searchPaths : `list` of `str`, optional 

376 Directory paths to search when calculating the full butler 

377 configuration. 

378 forceConfigRoot : `bool`, optional 

379 If `False`, any values present in the supplied ``config`` that 

380 would normally be reset are not overridden and will appear 

381 directly in the output config. This allows non-standard overrides 

382 of the root directory for a datastore or registry to be given. 

383 If this parameter is `True` the values for ``root`` will be 

384 forced into the resulting config if appropriate. 

385 outfile : `lsst.resources.ResourcePathExpression`, optional 

386 If not-`None`, the output configuration will be written to this 

387 location rather than into the repository itself. Can be a URI 

388 string. Can refer to a directory that will be used to write 

389 ``butler.yaml``. 

390 overwrite : `bool`, optional 

391 Create a new configuration file even if one already exists 

392 in the specified output location. Default is to raise 

393 an exception. 

394 

395 Returns 

396 ------- 

397 config : `Config` 

398 The updated `Config` instance written to the repo. 

399 

400 Raises 

401 ------ 

402 ValueError 

403 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

404 regular Config (as these subclasses would make it impossible to 

405 support ``standalone=False``). 

406 FileExistsError 

407 Raised if the output config file already exists. 

408 os.error 

409 Raised if the directory does not exist, exists but is not a 

410 directory, or cannot be created. 

411 

412 Notes 

413 ----- 

414 Note that when ``standalone=False`` (the default), the configuration 

415 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

416 construct the repository should also be used to construct any Butlers 

417 to avoid configuration inconsistencies. 
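
Examples
--------
A minimal sketch of creating and then opening a new repository (the
path is illustrative)::

    Butler.makeRepo("/path/to/new/repo")
    butler = Butler("/path/to/new/repo", writeable=True)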

418 """ 

419 if isinstance(config, (ButlerConfig, ConfigSubset)): 

420 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

421 

422 # Ensure that the root of the repository exists or can be made 

423 root_uri = ResourcePath(root, forceDirectory=True) 

424 root_uri.mkdir() 

425 

426 config = Config(config) 

427 

428 # If we are creating a new repo from scratch with relative roots, 

429 # do not propagate an explicit root from the config file 

430 if "root" in config: 

431 del config["root"] 

432 

433 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

434 imported_class = doImportType(full["datastore", "cls"]) 

435 if not issubclass(imported_class, Datastore): 

436 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

437 datastoreClass: Type[Datastore] = imported_class 

438 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

439 

440 # if key exists in given config, parse it, otherwise parse the defaults 

441 # in the expanded config 

442 if config.get(("registry", "db")): 

443 registryConfig = RegistryConfig(config) 

444 else: 

445 registryConfig = RegistryConfig(full) 

446 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

447 if defaultDatabaseUri is not None: 

448 Config.updateParameters( 

449 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

450 ) 

451 else: 

452 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

453 

454 if standalone: 

455 config.merge(full) 

456 else: 

457 # Always expand the registry.managers section into the per-repo 

458 # config, because after the database schema is created, it's not 

459 # allowed to change anymore. Note that in the standalone=True 

460 # branch, _everything_ in the config is expanded, so there's no 

461 # need to special case this. 

462 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

463 configURI: ResourcePathExpression 

464 if outfile is not None: 

465 # When writing to a separate location we must include 

466 # the root of the butler repo in the config else it won't know 

467 # where to look. 

468 config["root"] = root_uri.geturl() 

469 configURI = outfile 

470 else: 

471 configURI = root_uri 

472 # Strip obscore configuration, if present, before writing the config

473 # to a file; the obscore config will be stored in the registry.

474 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

475 config_to_write = config.copy() 

476 del config_to_write[obscore_config_key] 

477 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

478 # configFile attribute is updated, need to copy it to original. 

479 config.configFile = config_to_write.configFile 

480 else: 

481 config.dumpToUri(configURI, overwrite=overwrite) 

482 

483 # Create Registry and populate tables 

484 registryConfig = RegistryConfig(config.get("registry")) 

485 dimensionConfig = DimensionConfig(dimensionConfig) 

486 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

487 

488 log.verbose("Wrote new Butler configuration file to %s", configURI) 

489 

490 return config 

491 

492 @classmethod 

493 def _unpickle( 

494 cls, 

495 config: ButlerConfig, 

496 collections: Optional[tuple[str, ...]], 

497 run: Optional[str], 

498 defaultDataId: Dict[str, str], 

499 writeable: bool, 

500 ) -> Butler: 

501 """Callable used to unpickle a Butler. 

502 

503 We prefer not to use ``Butler.__init__`` directly so we can force some 

504 of its many arguments to be keyword-only (note that ``__reduce__`` 

505 can only invoke callables with positional arguments). 

506 

507 Parameters 

508 ---------- 

509 config : `ButlerConfig` 

510 Butler configuration, already coerced into a true `ButlerConfig` 

511 instance (and hence after any search paths for overrides have been 

512 utilized). 

513 collections : `tuple` [ `str` ] 

514 Names of the default collections to read from. 

515 run : `str`, optional 

516 Name of the default `~CollectionType.RUN` collection to write to. 

517 defaultDataId : `dict` [ `str`, `str` ] 

518 Default data ID values. 

519 writeable : `bool` 

520 Whether the Butler should support write operations. 

521 

522 Returns 

523 ------- 

524 butler : `Butler` 

525 A new `Butler` instance. 

526 """ 

527 # MyPy doesn't recognize that the kwargs below are totally valid; it 

528 # seems to think ``**defaultDataId`` is a _positional_ argument! 

529 return cls( 

530 config=config, 

531 collections=collections, 

532 run=run, 

533 writeable=writeable, 

534 **defaultDataId, # type: ignore 

535 ) 

536 

537 def __reduce__(self) -> tuple: 

538 """Support pickling.""" 

539 return ( 

540 Butler._unpickle, 

541 ( 

542 self._config, 

543 self.collections, 

544 self.run, 

545 self.registry.defaults.dataId.byName(), 

546 self.registry.isWriteable(), 

547 ), 

548 ) 

549 

550 def __str__(self) -> str: 

551 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

552 self.collections, self.run, self.datastore, self.registry 

553 ) 

554 

555 def isWriteable(self) -> bool: 

556 """Return `True` if this `Butler` supports write operations.""" 

557 return self.registry.isWriteable() 

558 

559 @contextlib.contextmanager 

560 def transaction(self) -> Iterator[None]: 

561 """Context manager supporting `Butler` transactions. 

562 

563 Transactions can be nested. 
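
Examples
--------
A sketch of grouping writes so that registry and datastore changes are
rolled back together if any of them fails (dataset types and data ID
values are illustrative)::

    with butler.transaction():
        butler.put(catalog, "src", visit=903334, detector=20)
        butler.put(metadata, "src_metadata", visit=903334, detector=20)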

564 """ 

565 with self.registry.transaction(): 

566 with self.datastore.transaction(): 

567 yield 

568 

569 def _standardizeArgs( 

570 self, 

571 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

572 dataId: Optional[DataId] = None, 

573 for_put: bool = True, 

574 **kwargs: Any, 

575 ) -> Tuple[DatasetType, Optional[DataId]]: 

576 """Standardize the arguments passed to several Butler APIs. 

577 

578 Parameters 

579 ---------- 

580 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

581 When `DatasetRef` the `dataId` should be `None`. 

582 Otherwise the `DatasetType` or name thereof. 

583 dataId : `dict` or `DataCoordinate` 

584 A `dict` of `Dimension` link name, value pairs that label the 

585 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

586 should be provided as the second argument. 

587 for_put : `bool`, optional 

588 If `True` this call is invoked as part of a `Butler.put()`. 

589 Otherwise it is assumed to be part of a `Butler.get()`. This 

590 parameter is only relevant if there is dataset type 

591 inconsistency. 

592 **kwargs 

593 Additional keyword arguments used to augment or construct a 

594 `DataCoordinate`. See `DataCoordinate.standardize` 

595 parameters. 

596 

597 Returns 

598 ------- 

599 datasetType : `DatasetType` 

600 A `DatasetType` instance extracted from ``datasetRefOrType``. 

601 dataId : `dict` or `DataId`, optional 

602 Argument that can be used (along with ``kwargs``) to construct a 

603 `DataId`. 

604 

605 Notes 

606 ----- 

607 Butler APIs that conceptually need a DatasetRef also allow passing a 

608 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

609 keyword arguments that can be used to construct one) separately. This 

610 method accepts those arguments and always returns a true `DatasetType` 

611 and a `DataId` or `dict`. 

612 

613 Standardization of `dict` vs `DataId` is best handled by passing the 

614 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

615 generally similarly flexible. 
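
Examples
--------
Both of these calls (names and values are illustrative) resolve to the
registry's `DatasetType` for ``calexp``; the data ID pieces are passed
through for later standardization::

    self._standardizeArgs("calexp", {"visit": 903334, "detector": 20})
    self._standardizeArgs("calexp", visit=903334, detector=20)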

616 """ 

617 externalDatasetType: Optional[DatasetType] = None 

618 internalDatasetType: Optional[DatasetType] = None 

619 if isinstance(datasetRefOrType, DatasetRef): 

620 if dataId is not None or kwargs: 

621 raise ValueError("DatasetRef given, cannot use dataId as well") 

622 externalDatasetType = datasetRefOrType.datasetType 

623 dataId = datasetRefOrType.dataId 

624 else: 

625 # Don't check whether DataId is provided, because Registry APIs 

626 # can usually construct a better error message when it wasn't. 

627 if isinstance(datasetRefOrType, DatasetType): 

628 externalDatasetType = datasetRefOrType 

629 else: 

630 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

631 

632 # Check that they are self-consistent 

633 if externalDatasetType is not None: 

634 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

635 if externalDatasetType != internalDatasetType: 

636 # We can allow differences if they are compatible, depending 

637 # on whether this is a get or a put. A get requires that 

638 # the python type associated with the datastore can be 

639 # converted to the user type. A put requires that the user 

640 # supplied python type can be converted to the internal 

641 # type expected by registry. 

642 relevantDatasetType = internalDatasetType 

643 if for_put: 

644 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

645 else: 

646 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

647 relevantDatasetType = externalDatasetType 

648 if not is_compatible: 

649 raise ValueError( 

650 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

651 f"registry definition ({internalDatasetType})" 

652 ) 

653 # Override the internal definition. 

654 internalDatasetType = relevantDatasetType 

655 

656 assert internalDatasetType is not None 

657 return internalDatasetType, dataId 

658 

659 def _rewrite_data_id( 

660 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

661 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

662 """Rewrite a data ID taking into account dimension records. 

663 

664 Take a Data ID and keyword args and rewrite it if necessary to 

665 allow the user to specify dimension records rather than dimension 

666 primary values. 

667 

668 This allows a user to include a dataId dict with keys of 

669 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

670 the integer exposure ID. It also allows a string to be given 

671 for a dimension value rather than the integer ID if that is more 

672 convenient. For example, rather than having to specify the 

673 detector with ``detector.full_name``, a string given for ``detector`` 

674 will be interpreted as the full name and converted to the integer 

675 value. 

676 

677 Keyword arguments can also use strings for dimensions like detector 

678 and exposure but python does not allow them to include ``.`` and 

679 so the ``exposure.day_obs`` syntax can not be used in a keyword 

680 argument. 

681 

682 Parameters 

683 ---------- 

684 dataId : `dict` or `DataCoordinate` 

685 A `dict` of `Dimension` link name, value pairs that will label the 

686 `DatasetRef` within a Collection. 

687 datasetType : `DatasetType` 

688 The dataset type associated with this dataId. Required to 

689 determine the relevant dimensions. 

690 **kwargs 

691 Additional keyword arguments used to augment or construct a 

692 `DataId`. See `DataId` parameters. 

693 

694 Returns 

695 ------- 

696 dataId : `dict` or `DataCoordinate` 

697 The, possibly rewritten, dataId. If given a `DataCoordinate` and 

698 no keyword arguments, the original dataId will be returned 

699 unchanged. 

700 **kwargs : `dict` 

701 Any unused keyword arguments (normally an empty `dict`). 
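
Examples
--------
A data ID given with record values (all values illustrative), such as::

    {"instrument": "LSSTCam", "exposure.day_obs": 20230501,
     "exposure.seq_num": 45, "detector": "R22_S11"}

would be rewritten so that ``exposure`` and ``detector`` carry their
integer primary-key values.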

702 """ 

703 # Do nothing if we have a standalone DataCoordinate. 

704 if isinstance(dataId, DataCoordinate) and not kwargs: 

705 return dataId, kwargs 

706 

707 # Process dimension records that are using record information 

708 # rather than ids 

709 newDataId: Dict[str, DataIdValue] = {} 

710 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

711 

712 # If the entire dataId comes from keyword parameters we do not need

713 # to do anything here because the keys cannot be of the form

714 # exposure.obs_id, since a "." is not allowed in a keyword parameter.

715 if dataId: 

716 for k, v in dataId.items(): 

717 # If we have a Dimension we do not need to do anything 

718 # because it cannot be a compound key. 

719 if isinstance(k, str) and "." in k: 

720 # Someone is using a more human-readable dataId 

721 dimensionName, record = k.split(".", 1) 

722 byRecord[dimensionName][record] = v 

723 elif isinstance(k, Dimension): 

724 newDataId[k.name] = v 

725 else: 

726 newDataId[k] = v 

727 

728 # Go through the updated dataId and check the type in case someone is 

729 # using an alternate key. We have already filtered out the compound

730 # dimension.record keys.

731 not_dimensions = {} 

732 

733 # Will need to look in the dataId and the keyword arguments 

734 # and will remove them if they need to be fixed or are unrecognized. 

735 for dataIdDict in (newDataId, kwargs): 

736 # Use a list so we can adjust the dict safely in the loop 

737 for dimensionName in list(dataIdDict): 

738 value = dataIdDict[dimensionName] 

739 try: 

740 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

741 except KeyError: 

742 # This is not a real dimension 

743 not_dimensions[dimensionName] = value 

744 del dataIdDict[dimensionName] 

745 continue 

746 

747 # Convert an integral type to an explicit int to simplify 

748 # comparisons here 

749 if isinstance(value, numbers.Integral): 

750 value = int(value) 

751 

752 if not isinstance(value, dimension.primaryKey.getPythonType()): 

753 for alternate in dimension.alternateKeys: 

754 if isinstance(value, alternate.getPythonType()): 

755 byRecord[dimensionName][alternate.name] = value 

756 del dataIdDict[dimensionName] 

757 log.debug( 

758 "Converting dimension %s to %s.%s=%s", 

759 dimensionName, 

760 dimensionName, 

761 alternate.name, 

762 value, 

763 ) 

764 break 

765 else: 

766 log.warning( 

767 "Type mismatch found for value '%r' provided for dimension %s. " 

768 "Could not find matching alternative (primary key has type %s) " 

769 "so attempting to use as-is.", 

770 value, 

771 dimensionName, 

772 dimension.primaryKey.getPythonType(), 

773 ) 

774 

775 # By this point kwargs and newDataId should only include valid 

776 # dimensions. Merge kwargs in to the new dataId and log if there 

777 # are dimensions in both (rather than calling update). 

778 for k, v in kwargs.items(): 

779 if k in newDataId and newDataId[k] != v: 

780 log.debug( 

781 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

782 ) 

783 newDataId[k] = v 

784 # No need to retain any values in kwargs now. 

785 kwargs = {} 

786 

787 # If we have some unrecognized dimensions we have to try to connect 

788 # them to records in other dimensions. This is made more complicated 

789 # by some dimensions having records with clashing names. A mitigation 

790 # is that we can tell by this point which dimensions are missing 

791 # for the DatasetType but this does not work for calibrations 

792 # where additional dimensions can be used to constrain the temporal 

793 # axis. 

794 if not_dimensions: 

795 # Search for all dimensions even if we have been given a value 

796 # explicitly. In some cases records are given as well as the 

797 # actual dimension and this should not be an error if they 

798 # match. 

799 mandatoryDimensions = datasetType.dimensions.names # - provided 

800 

801 candidateDimensions: Set[str] = set() 

802 candidateDimensions.update(mandatoryDimensions) 

803 

804 # For calibrations we may well be needing temporal dimensions 

805 # so rather than always including all dimensions in the scan 

806 # restrict things a little. It is still possible for there 

807 # to be confusion over day_obs in visit vs exposure for example. 

808 # If we are not searching calibration collections things may 

809 # fail but they are going to fail anyway because of the 

810 # ambiguity of the dataId... 

811 if datasetType.isCalibration(): 

812 for dim in self.registry.dimensions.getStaticDimensions(): 

813 if dim.temporal: 

814 candidateDimensions.add(str(dim)) 

815 

816 # Look up table for the first association with a dimension 

817 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

818 

819 # Keep track of whether an item is associated with multiple 

820 # dimensions. 

821 counter: Counter[str] = Counter() 

822 assigned: Dict[str, Set[str]] = defaultdict(set) 

823 

824 # Go through the missing dimensions and associate the 

825 # given names with records within those dimensions 

826 matched_dims = set() 

827 for dimensionName in candidateDimensions: 

828 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

829 fields = dimension.metadata.names | dimension.uniqueKeys.names 

830 for field in not_dimensions: 

831 if field in fields: 

832 guessedAssociation[dimensionName][field] = not_dimensions[field] 

833 counter[dimensionName] += 1 

834 assigned[field].add(dimensionName) 

835 matched_dims.add(field) 

836 

837 # Calculate the fields that matched nothing. 

838 never_found = set(not_dimensions) - matched_dims 

839 

840 if never_found: 

841 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

842 

843 # There is a chance we have allocated a single dataId item 

844 # to multiple dimensions. Need to decide which should be retained. 

845 # For now assume that the most popular alternative wins. 

846 # This means that day_obs with seq_num will result in 

847 # exposure.day_obs and not visit.day_obs 

848 # Also prefer an explicitly missing dimension over an inferred 

849 # temporal dimension. 

850 for fieldName, assignedDimensions in assigned.items(): 

851 if len(assignedDimensions) > 1: 

852 # Pick the most popular (preferring mandatory dimensions) 

853 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

854 if requiredButMissing: 

855 candidateDimensions = requiredButMissing 

856 else: 

857 candidateDimensions = assignedDimensions 

858 

859 # If this is a choice between visit and exposure and 

860 # neither was a required part of the dataset type, 

861 # (hence in this branch) always prefer exposure over 

862 # visit since exposures are always defined and visits 

863 # are defined from exposures. 

864 if candidateDimensions == {"exposure", "visit"}: 

865 candidateDimensions = {"exposure"} 

866 

867 # Select the relevant items and get a new restricted 

868 # counter. 

869 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

870 duplicatesCounter: Counter[str] = Counter() 

871 duplicatesCounter.update(theseCounts) 

872 

873 # Choose the most common. If they are equally common 

874 # we will pick the one that was found first. 

875 # Returns a list of tuples 

876 selected = duplicatesCounter.most_common(1)[0][0] 

877 

878 log.debug( 

879 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

880 " Removed ambiguity by choosing dimension %s.", 

881 fieldName, 

882 ", ".join(assignedDimensions), 

883 selected, 

884 ) 

885 

886 for candidateDimension in assignedDimensions: 

887 if candidateDimension != selected: 

888 del guessedAssociation[candidateDimension][fieldName] 

889 

890 # Update the record look up dict with the new associations 

891 for dimensionName, values in guessedAssociation.items(): 

892 if values: # A dict might now be empty 

893 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

894 byRecord[dimensionName].update(values) 

895 

896 if byRecord: 

897 # Some record specifiers were found so we need to convert 

898 # them to the Id form 

899 for dimensionName, values in byRecord.items(): 

900 if dimensionName in newDataId: 

901 log.debug( 

902 "DataId specified explicit %s dimension value of %s in addition to" 

903 " general record specifiers for it of %s. Ignoring record information.", 

904 dimensionName, 

905 newDataId[dimensionName], 

906 str(values), 

907 ) 

908 # Get the actual record and compare with these values. 

909 try: 

910 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

911 except DataIdError: 

912 raise ValueError( 

913 f"Could not find dimension '{dimensionName}'" 

914 f" with dataId {newDataId} as part of comparing with" 

915 f" record values {byRecord[dimensionName]}" 

916 ) from None 

917 if len(recs) == 1: 

918 errmsg: List[str] = [] 

919 for k, v in values.items(): 

920 if (recval := getattr(recs[0], k)) != v: 

921 errmsg.append(f"{k}({recval} != {v})") 

922 if errmsg: 

923 raise ValueError( 

924 f"Dimension {dimensionName} in dataId has explicit value" 

925 " inconsistent with records: " + ", ".join(errmsg) 

926 ) 

927 else: 

928 # Multiple matches for an explicit dimension 

929 # should never happen but let downstream complain. 

930 pass 

931 continue 

932 

933 # Build up a WHERE expression 

934 bind = {k: v for k, v in values.items()} 

935 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

936 

937 # Hopefully we get a single record that matches 

938 records = set( 

939 self.registry.queryDimensionRecords( 

940 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

941 ) 

942 ) 

943 

944 if len(records) != 1: 

945 if len(records) > 1: 

946 # visit can have an ambiguous answer without involving 

947 # visit_system. The default visit_system is defined 

948 # by the instrument. 

949 if ( 

950 dimensionName == "visit" 

951 and "visit_system_membership" in self.registry.dimensions 

952 and "visit_system" in self.registry.dimensions["instrument"].metadata 

953 ): 

954 instrument_records = list( 

955 self.registry.queryDimensionRecords( 

956 "instrument", 

957 dataId=newDataId, 

958 **kwargs, 

959 ) 

960 ) 

961 if len(instrument_records) == 1: 

962 visit_system = instrument_records[0].visit_system 

963 if visit_system is None: 

964 # Set to a value that will never match. 

965 visit_system = -1 

966 

967 # Look up each visit in the 

968 # visit_system_membership records. 

969 for rec in records: 

970 membership = list( 

971 self.registry.queryDimensionRecords( 

972 # Use bind to allow zero results. 

973 # This is a fully-specified query. 

974 "visit_system_membership", 

975 where="instrument = inst AND visit_system = system AND visit = v", 

976 bind=dict( 

977 inst=instrument_records[0].name, system=visit_system, v=rec.id 

978 ), 

979 ) 

980 ) 

981 if membership: 

982 # This record is the right answer. 

983 records = set([rec]) 

984 break 

985 

986 # The ambiguity may have been resolved so check again. 

987 if len(records) > 1: 

988 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

989 for r in records: 

990 log.debug("- %s", str(r)) 

991 raise ValueError( 

992 f"DataId specification for dimension {dimensionName} is not" 

993 f" uniquely constrained to a single dataset by {values}." 

994 f" Got {len(records)} results." 

995 ) 

996 else: 

997 raise ValueError( 

998 f"DataId specification for dimension {dimensionName} matched no" 

999 f" records when constrained by {values}" 

1000 ) 

1001 

1002 # Get the primary key from the real dimension object 

1003 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1004 if not isinstance(dimension, Dimension): 

1005 raise RuntimeError( 

1006 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1007 ) 

1008 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1009 

1010 return newDataId, kwargs 

1011 

1012 def _findDatasetRef( 

1013 self, 

1014 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1015 dataId: Optional[DataId] = None, 

1016 *, 

1017 collections: Any = None, 

1018 allowUnresolved: bool = False, 

1019 **kwargs: Any, 

1020 ) -> DatasetRef: 

1021 """Shared logic for methods that start with a search for a dataset in 

1022 the registry. 

1023 

1024 Parameters 

1025 ---------- 

1026 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1027 When `DatasetRef` the `dataId` should be `None`. 

1028 Otherwise the `DatasetType` or name thereof. 

1029 dataId : `dict` or `DataCoordinate`, optional 

1030 A `dict` of `Dimension` link name, value pairs that label the 

1031 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1032 should be provided as the first argument. 

1033 collections : Any, optional 

1034 Collections to be searched, overriding ``self.collections``. 

1035 Can be any of the types supported by the ``collections`` argument 

1036 to butler construction. 

1037 allowUnresolved : `bool`, optional 

1038 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1039 one in the `Registry` fails. Defaults to `False`. 

1040 **kwargs 

1041 Additional keyword arguments used to augment or construct a 

1042 `DataId`. See `DataId` parameters. 

1043 

1044 Returns 

1045 ------- 

1046 ref : `DatasetRef` 

1047 A reference to the dataset identified by the given arguments. 

1048 This can be the same dataset reference as given if it was 

1049 resolved. 

1050 

1051 Raises 

1052 ------ 

1053 LookupError 

1054 Raised if no matching dataset exists in the `Registry` (and 

1055 ``allowUnresolved is False``). 

1056 ValueError 

1057 Raised if a resolved `DatasetRef` was passed as an input, but it 

1058 differs from the one found in the registry. 

1059 TypeError 

1060 Raised if no collections were provided. 

1061 """ 

1062 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1063 if isinstance(datasetRefOrType, DatasetRef): 

1064 idNumber = datasetRefOrType.id 

1065 # This is a resolved ref, return it immediately. 

1066 if idNumber: 

1067 return datasetRefOrType 

1068 else: 

1069 idNumber = None 

1070 timespan: Optional[Timespan] = None 

1071 

1072 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1073 

1074 if datasetType.isCalibration(): 

1075 # Because this is a calibration dataset, first try to

1076 # standardize the data ID without restricting the dimensions to

1077 # those of the dataset type requested, because there may be extra 

1078 # dimensions that provide temporal information for a validity-range 

1079 # lookup. 

1080 dataId = DataCoordinate.standardize( 

1081 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1082 ) 

1083 if dataId.graph.temporal: 

1084 dataId = self.registry.expandDataId(dataId) 

1085 timespan = dataId.timespan 

1086 else: 

1087 # Standardize the data ID to just the dimensions of the dataset 

1088 # type instead of letting registry.findDataset do it, so we get the 

1089 # result even if no dataset is found. 

1090 dataId = DataCoordinate.standardize( 

1091 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1092 ) 

1093 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1094 # present in the current collection. 

1095 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1096 if ref is None: 

1097 if allowUnresolved: 

1098 with warnings.catch_warnings(): 

1099 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

1100 return DatasetRef(datasetType, dataId) 

1101 else: 

1102 if collections is None: 

1103 collections = self.registry.defaults.collections 

1104 raise LookupError( 

1105 f"Dataset {datasetType.name} with data ID {dataId} " 

1106 f"could not be found in collections {collections}." 

1107 ) 

1108 if idNumber is not None and idNumber != ref.id: 

1109 if collections is None: 

1110 collections = self.registry.defaults.collections 

1111 raise ValueError( 

1112 f"DatasetRef.id provided ({idNumber}) does not match " 

1113 f"id ({ref.id}) in registry in collections {collections}." 

1114 ) 

1115 if datasetType != ref.datasetType: 

1116 # If they differ it is because the user explicitly specified 

1117 # a compatible dataset type to this call rather than using the 

1118 # registry definition. The DatasetRef must therefore be recreated 

1119 # using the user definition such that the expected type is 

1120 # returned. 

1121 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1122 

1123 return ref 

1124 

1125 @transactional 

1126 @deprecated( 

1127 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1128 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1129 " were relying on the run parameter to determine the run." 

1130 " Will be removed after v27.0.", 

1131 version="v26.0", 

1132 category=FutureWarning, 

1133 ) 

1134 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1135 # Docstring inherited. 

1136 return self.put(obj, ref) 

1137 

1138 @transactional 

1139 def put( 

1140 self, 

1141 obj: Any, 

1142 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1143 /, 

1144 dataId: Optional[DataId] = None, 

1145 *, 

1146 run: Optional[str] = None, 

1147 **kwargs: Any, 

1148 ) -> DatasetRef: 

1149 """Store and register a dataset. 

1150 

1151 Parameters 

1152 ---------- 

1153 obj : `object` 

1154 The dataset. 

1155 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1156 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1157 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1158 `DatasetRef` is given the run and ID are used directly. 

1159 dataId : `dict` or `DataCoordinate` 

1160 A `dict` of `Dimension` link name, value pairs that label the 

1161 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1162 should be provided as the second argument. 

1163 run : `str`, optional 

1164 The name of the run the dataset should be added to, overriding 

1165 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1166 **kwargs 

1167 Additional keyword arguments used to augment or construct a 

1168 `DataCoordinate`. See `DataCoordinate.standardize` 

1169 parameters. Not used if a resolved `DatasetRef` is provided. 

1170 

1171 Returns 

1172 ------- 

1173 ref : `DatasetRef` 

1174 A reference to the stored dataset, updated with the correct id if 

1175 given. 

1176 

1177 Raises 

1178 ------ 

1179 TypeError 

1180 Raised if the butler is read-only or if no run has been provided. 
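
Examples
--------
A minimal sketch of writing a dataset to a run (dataset type, run name
and data ID values are illustrative)::

    butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
    ref = butler.put(exposure, "calexp", visit=903334, detector=20,
                     instrument="HSC")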

1181 """ 

1182 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1183 # This is a direct put of predefined DatasetRef. 

1184 log.debug("Butler put direct: %s", datasetRefOrType) 

1185 (imported_ref,) = self.registry._importDatasets( 

1186 [datasetRefOrType], 

1187 expand=True, 

1188 ) 

1189 if imported_ref.id != datasetRefOrType.getCheckedId(): 

1190 raise RuntimeError("This registry configuration does not support direct put of ref.") 

1191 self.datastore.put(obj, datasetRefOrType) 

1192 return datasetRefOrType 

1193 

1194 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1195 if not self.isWriteable(): 

1196 raise TypeError("Butler is read-only.") 

1197 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1198 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1199 raise ValueError("DatasetRef must not be in registry, must have None id") 

1200 

1201 # Handle dimension records in dataId 

1202 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1203 

1204 # Add Registry Dataset entry. 

1205 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1206 

1207 # For an execution butler the datasets will be pre-defined. 

1208 # If the butler is configured that way datasets should only be inserted 

1209 # if they do not already exist in registry. Trying and catching 

1210 # ConflictingDefinitionError will not work because the transaction 

1211 # will be corrupted. Instead, in this mode always check first. 

1212 ref = None 

1213 ref_is_predefined = False 

1214 if self._allow_put_of_predefined_dataset: 

1215 # Get the matching ref for this run. 

1216 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1217 

1218 if ref: 

1219 # Must be expanded form for datastore templating 

1220 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1221 ref = ref.expanded(dataId) 

1222 ref_is_predefined = True 

1223 

1224 if not ref: 

1225 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1226 

1227 # If the ref is predefined it is possible that the datastore also 

1228 # has the record. Asking datastore to put it again will result in 

1229 # the artifact being recreated, overwriting previous, then will cause 

1230 # a failure in writing the record which will cause the artifact 

1231 # to be removed. Much safer to ask first before attempting to 

1232 # overwrite. Race conditions should not be an issue for the 

1233 # execution butler environment. 

1234 if ref_is_predefined: 

1235 if self.datastore.knows(ref): 

1236 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.") 

1237 

1238 self.datastore.put(obj, ref) 

1239 

1240 return ref 

1241 

1242 @deprecated( 

1243 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1244 " Please use Butler.get(). Will be removed after v27.0.", 

1245 version="v26.0", 

1246 category=FutureWarning, 

1247 ) 

1248 def getDirect( 

1249 self, 

1250 ref: DatasetRef, 

1251 *, 

1252 parameters: Optional[Dict[str, Any]] = None, 

1253 storageClass: Optional[Union[StorageClass, str]] = None, 

1254 ) -> Any: 

1255 """Retrieve a stored dataset. 

1256 

1257 Parameters 

1258 ---------- 

1259 ref : `DatasetRef` 

1260 Resolved reference to an already stored dataset. 

1261 parameters : `dict` 

1262 Additional StorageClass-defined options to control reading, 

1263 typically used to efficiently read only a subset of the dataset. 

1264 storageClass : `StorageClass` or `str`, optional 

1265 The storage class to be used to override the Python type 

1266 returned by this method. By default the returned type matches 

1267 the dataset type definition for this dataset. Specifying a 

1268 read `StorageClass` can force a different type to be returned. 

1269 This type must be compatible with the original type. 

1270 

1271 Returns 

1272 ------- 

1273 obj : `object` 

1274 The dataset. 

1275 """ 

1276 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1277 

1278 @deprecated( 

1279 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1280 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1281 version="v26.0", 

1282 category=FutureWarning, 

1283 ) 

1284 def getDirectDeferred( 

1285 self, 

1286 ref: DatasetRef, 

1287 *, 

1288 parameters: Union[dict, None] = None, 

1289 storageClass: str | StorageClass | None = None, 

1290 ) -> DeferredDatasetHandle: 

1291 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1292 from a resolved `DatasetRef`. 

1293 

1294 Parameters 

1295 ---------- 

1296 ref : `DatasetRef` 

1297 Resolved reference to an already stored dataset. 

1298 parameters : `dict` 

1299 Additional StorageClass-defined options to control reading, 

1300 typically used to efficiently read only a subset of the dataset. 

1301 storageClass : `StorageClass` or `str`, optional 

1302 The storage class to be used to override the Python type 

1303 returned by this method. By default the returned type matches 

1304 the dataset type definition for this dataset. Specifying a 

1305 read `StorageClass` can force a different type to be returned. 

1306 This type must be compatible with the original type. 

1307 

1308 Returns 

1309 ------- 

1310 obj : `DeferredDatasetHandle` 

1311 A handle which can be used to retrieve a dataset at a later time. 

1312 

1313 Raises 

1314 ------ 

1315 AmbiguousDatasetError 

1316 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1317 """ 

1318 if ref.id is None: 

1319 raise AmbiguousDatasetError( 

1320 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1321 ) 

1322 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1323 

1324 def getDeferred( 

1325 self, 

1326 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1327 /, 

1328 dataId: Optional[DataId] = None, 

1329 *, 

1330 parameters: Union[dict, None] = None, 

1331 collections: Any = None, 

1332 storageClass: str | StorageClass | None = None, 

1333 **kwargs: Any, 

1334 ) -> DeferredDatasetHandle: 

1335 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1336 after an immediate registry lookup. 

1337 

1338 Parameters 

1339 ---------- 

1340 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1341 When `DatasetRef` the `dataId` should be `None`. 

1342 Otherwise the `DatasetType` or name thereof. 

1343 dataId : `dict` or `DataCoordinate`, optional 

1344 A `dict` of `Dimension` link name, value pairs that label the 

1345 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1346 should be provided as the first argument. 

1347 parameters : `dict` 

1348 Additional StorageClass-defined options to control reading, 

1349 typically used to efficiently read only a subset of the dataset. 

1350 collections : Any, optional 

1351 Collections to be searched, overriding ``self.collections``. 

1352 Can be any of the types supported by the ``collections`` argument 

1353 to butler construction. 

1354 storageClass : `StorageClass` or `str`, optional 

1355 The storage class to be used to override the Python type 

1356 returned by this method. By default the returned type matches 

1357 the dataset type definition for this dataset. Specifying a 

1358 read `StorageClass` can force a different type to be returned. 

1359 This type must be compatible with the original type. 

1360 **kwargs 

1361 Additional keyword arguments used to augment or construct a 

1362 `DataId`. See `DataId` parameters. 

1363 

1364 Returns 

1365 ------- 

1366 obj : `DeferredDatasetHandle` 

1367 A handle which can be used to retrieve a dataset at a later time. 

1368 

1369 Raises 

1370 ------ 

1371 LookupError 

1372 Raised if no matching dataset exists in the `Registry` in the

1373 given collections.

1374 ValueError 

1375 Raised if a resolved `DatasetRef` was passed as an input, but it 

1376 differs from the one found in the registry. 

1377 TypeError 

1378 Raised if no collections were provided. 
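
Examples
--------
A sketch of deferring the read until the data are needed (dataset type
and data ID values are illustrative)::

    handle = butler.getDeferred("calexp", visit=903334, detector=20,
                                instrument="HSC")
    exposure = handle.get()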

1379 """ 

1380 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1381 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1382 

1383 def get( 

1384 self, 

1385 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1386 /, 

1387 dataId: Optional[DataId] = None, 

1388 *, 

1389 parameters: Optional[Dict[str, Any]] = None, 

1390 collections: Any = None, 

1391 storageClass: Optional[Union[StorageClass, str]] = None, 

1392 **kwargs: Any, 

1393 ) -> Any: 

1394 """Retrieve a stored dataset. 

1395 

1396 Parameters 

1397 ---------- 

1398 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1399 When `DatasetRef` the `dataId` should be `None`. 

1400 Otherwise the `DatasetType` or name thereof. 

1401 If a resolved `DatasetRef`, the associated dataset 

1402 is returned directly without additional querying. 

1403 dataId : `dict` or `DataCoordinate` 

1404 A `dict` of `Dimension` link name, value pairs that label the 

1405 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1406 should be provided as the first argument. 

1407 parameters : `dict` 

1408 Additional StorageClass-defined options to control reading, 

1409 typically used to efficiently read only a subset of the dataset. 

1410 collections : Any, optional 

1411 Collections to be searched, overriding ``self.collections``. 

1412 Can be any of the types supported by the ``collections`` argument 

1413 to butler construction. 

1414 storageClass : `StorageClass` or `str`, optional 

1415 The storage class to be used to override the Python type 

1416 returned by this method. By default the returned type matches 

1417 the dataset type definition for this dataset. Specifying a 

1418 read `StorageClass` can force a different type to be returned. 

1419 This type must be compatible with the original type. 

1420 **kwargs 

1421 Additional keyword arguments used to augment or construct a 

1422 `DataCoordinate`. See `DataCoordinate.standardize` 

1423 parameters. 

1424 

1425 Returns 

1426 ------- 

1427 obj : `object` 

1428 The dataset. 

1429 

1430 Raises 

1431 ------ 

1432 LookupError 

1433 Raised if no matching dataset exists in the `Registry`. 

1434 TypeError 

1435 Raised if no collections were provided. 

1436 

1437 Notes 

1438 ----- 

1439 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1440 this method requires that the given data ID include temporal dimensions 

1441 beyond the dimensions of the dataset type itself, in order to find the 

1442 dataset with the appropriate validity range. For example, a "bias" 

1443 dataset with native dimensions ``{instrument, detector}`` could be 

1444 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1445 ``exposure`` is a temporal dimension. 
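
Examples
--------
A typical call; the dataset type, data ID, and collection names below
are illustrative only::

    bias = butler.get(
        "bias", instrument="MyCam", detector=1, collections="calib/run1"
    )
    # A resolved DatasetRef (e.g. from a registry query) can be passed
    # directly and does not need a data ID or collections.
    calexp = butler.get(ref)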

1446 """ 

1447 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1448 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1449 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1450 

1451 def getURIs( 

1452 self, 

1453 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1454 /, 

1455 dataId: Optional[DataId] = None, 

1456 *, 

1457 predict: bool = False, 

1458 collections: Any = None, 

1459 run: Optional[str] = None, 

1460 **kwargs: Any, 

1461 ) -> DatasetRefURIs: 

1462 """Returns the URIs associated with the dataset. 

1463 

1464 Parameters 

1465 ---------- 

1466 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1467 When `DatasetRef` the `dataId` should be `None`. 

1468 Otherwise the `DatasetType` or name thereof. 

1469 dataId : `dict` or `DataCoordinate` 

1470 A `dict` of `Dimension` link name, value pairs that label the 

1471 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1472 should be provided as the first argument. 

1473 predict : `bool` 

1474 If `True`, allow URIs to be returned of datasets that have not 

1475 been written. 

1476 collections : Any, optional 

1477 Collections to be searched, overriding ``self.collections``. 

1478 Can be any of the types supported by the ``collections`` argument 

1479 to butler construction. 

1480 run : `str`, optional 

1481 Run to use for predictions, overriding ``self.run``. 

1482 **kwargs 

1483 Additional keyword arguments used to augment or construct a 

1484 `DataCoordinate`. See `DataCoordinate.standardize` 

1485 parameters. 

1486 

1487 Returns 

1488 ------- 

1489 uris : `DatasetRefURIs` 

1490 The URI to the primary artifact associated with this dataset (if 

1491 the dataset was disassembled within the datastore this may be 

1492 `None`), and the URIs to any components associated with the dataset 

1493 artifact (this can be empty if there are no components). 
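
Examples
--------
An illustrative lookup (dataset type, data ID, and collection names are
placeholders)::

    primary, components = butler.getURIs(
        "calexp", instrument="MyCam", visit=42, detector=1,
        collections="runs/example",
    )
    # ``primary`` may be `None` if the dataset was disassembled;
    # ``components`` holds any per-component URIs and may be empty.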

1494 """ 

1495 ref = self._findDatasetRef( 

1496 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1497 ) 

1498 if ref.id is None: # only possible if predict is True 

1499 if run is None: 

1500 run = self.run 

1501 if run is None: 

1502 raise TypeError("Cannot predict location with run=None.") 

1503 # Lie about ID, because we can't guess it, and only 

1504 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1505 with warnings.catch_warnings(): 

1506 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

1507 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run) 

1508 return self.datastore.getURIs(ref, predict) 

1509 

1510 def getURI( 

1511 self, 

1512 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1513 /, 

1514 dataId: Optional[DataId] = None, 

1515 *, 

1516 predict: bool = False, 

1517 collections: Any = None, 

1518 run: Optional[str] = None, 

1519 **kwargs: Any, 

1520 ) -> ResourcePath: 

1521 """Return the URI to the Dataset. 

1522 

1523 Parameters 

1524 ---------- 

1525 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1526 When `DatasetRef` the `dataId` should be `None`. 

1527 Otherwise the `DatasetType` or name thereof. 

1528 dataId : `dict` or `DataCoordinate` 

1529 A `dict` of `Dimension` link name, value pairs that label the 

1530 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1531 should be provided as the first argument. 

1532 predict : `bool` 

1533 If `True`, allow URIs to be returned of datasets that have not 

1534 been written. 

1535 collections : Any, optional 

1536 Collections to be searched, overriding ``self.collections``. 

1537 Can be any of the types supported by the ``collections`` argument 

1538 to butler construction. 

1539 run : `str`, optional 

1540 Run to use for predictions, overriding ``self.run``. 

1541 **kwargs 

1542 Additional keyword arguments used to augment or construct a 

1543 `DataCoordinate`. See `DataCoordinate.standardize` 

1544 parameters. 

1545 

1546 Returns 

1547 ------- 

1548 uri : `lsst.resources.ResourcePath` 

1549 URI pointing to the Dataset within the datastore. If the 

1550 Dataset does not exist in the datastore, and if ``predict`` is 

1551 `True`, the URI will be a prediction and will include a URI 

1552 fragment "#predicted". 

1553 If the datastore does not have entities that relate well 

1554 to the concept of a URI, the returned URI string will be 

1555 descriptive. The returned URI is not guaranteed to be obtainable. 

1556 

1557 Raises 

1558 ------ 

1559 LookupError 

1560 Raised if a URI has been requested for a dataset that does not 

1561 exist and guessing is not allowed. 

1562 ValueError 

1563 Raised if a resolved `DatasetRef` was passed as an input, but it 

1564 differs from the one found in the registry. 

1565 TypeError 

1566 Raised if no collections were provided. 

1567 RuntimeError 

1568 Raised if a URI is requested for a dataset that consists of 

1569 multiple artifacts. 
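
Examples
--------
Illustrative calls; the dataset type, data ID, collection, and run names
are placeholders::

    uri = butler.getURI(
        "bias", instrument="MyCam", detector=1, collections="calib/run1"
    )
    # Predict the location of a dataset that has not been written yet.
    future_uri = butler.getURI(
        "bias", instrument="MyCam", detector=2, predict=True, run="calib/run1"
    )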

1570 """ 

1571 primary, components = self.getURIs( 

1572 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1573 ) 

1574 

1575 if primary is None or components: 

1576 raise RuntimeError( 

1577 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1578 "Use Butler.getURIs() instead." 

1579 ) 

1580 return primary 

1581 

1582 def retrieveArtifacts( 

1583 self, 

1584 refs: Iterable[DatasetRef], 

1585 destination: ResourcePathExpression, 

1586 transfer: str = "auto", 

1587 preserve_path: bool = True, 

1588 overwrite: bool = False, 

1589 ) -> List[ResourcePath]: 

1590 """Retrieve the artifacts associated with the supplied refs. 

1591 

1592 Parameters 

1593 ---------- 

1594 refs : iterable of `DatasetRef` 

1595 The datasets for which artifacts are to be retrieved. 

1596 A single ref can result in multiple artifacts. The refs must 

1597 be resolved. 

1598 destination : `lsst.resources.ResourcePath` or `str` 

1599 Location to write the artifacts. 

1600 transfer : `str`, optional 

1601 Method to use to transfer the artifacts. Must be one of the options 

1602 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1603 "move" is not allowed. 

1604 preserve_path : `bool`, optional 

1605 If `True` the full path of the artifact within the datastore 

1606 is preserved. If `False` the final file component of the path 

1607 is used. 

1608 overwrite : `bool`, optional 

1609 If `True` allow transfers to overwrite existing files at the 

1610 destination. 

1611 

1612 Returns 

1613 ------- 

1614 targets : `list` of `lsst.resources.ResourcePath` 

1615 URIs of file artifacts in destination location. Order is not 

1616 preserved. 

1617 

1618 Notes 

1619 ----- 

1620 For non-file datastores the artifacts written to the destination 

1621 may not match the representation inside the datastore. For example 

1622 a hierarchical data structure in a NoSQL database may well be stored 

1623 as a JSON file. 
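
Examples
--------
A sketch that copies the artifacts for a query result to a local
directory; the dataset type, collection, and destination are
illustrative::

    refs = butler.registry.queryDatasets("raw", collections="MyCam/raw/all")
    paths = butler.retrieveArtifacts(
        refs, "/tmp/extracted", transfer="copy", preserve_path=False
    )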

1624 """ 

1625 return self.datastore.retrieveArtifacts( 

1626 refs, 

1627 ResourcePath(destination), 

1628 transfer=transfer, 

1629 preserve_path=preserve_path, 

1630 overwrite=overwrite, 

1631 ) 

1632 

1633 def datasetExists( 

1634 self, 

1635 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1636 dataId: Optional[DataId] = None, 

1637 *, 

1638 collections: Any = None, 

1639 **kwargs: Any, 

1640 ) -> bool: 

1641 """Return True if the Dataset is actually present in the Datastore. 

1642 

1643 Parameters 

1644 ---------- 

1645 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1646 When `DatasetRef` the `dataId` should be `None`. 

1647 Otherwise the `DatasetType` or name thereof. 

1648 dataId : `dict` or `DataCoordinate` 

1649 A `dict` of `Dimension` link name, value pairs that label the 

1650 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1651 should be provided as the first argument. 

1652 collections : Any, optional 

1653 Collections to be searched, overriding ``self.collections``. 

1654 Can be any of the types supported by the ``collections`` argument 

1655 to butler construction. 

1656 **kwargs 

1657 Additional keyword arguments used to augment or construct a 

1658 `DataCoordinate`. See `DataCoordinate.standardize` 

1659 parameters. 

1660 

1661 Raises 

1662 ------ 

1663 LookupError 

1664 Raised if the dataset is not even present in the Registry. 

1665 ValueError 

1666 Raised if a resolved `DatasetRef` was passed as an input, but it 

1667 differs from the one found in the registry. 

1668 TypeError 

1669 Raised if no collections were provided. 
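
Examples
--------
Check for the datastore artifact before reading (names are illustrative)::

    if butler.datasetExists(
        "bias", instrument="MyCam", detector=1, collections="calib/run1"
    ):
        bias = butler.get(
            "bias", instrument="MyCam", detector=1, collections="calib/run1"
        )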

1670 """ 

1671 # A resolved ref may be given that is not known to this butler. 

1672 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1673 ref = self.registry.getDataset(datasetRefOrType.id) 

1674 if ref is None: 

1675 raise LookupError( 

1676 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1677 ) 

1678 else: 

1679 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1680 return self.datastore.exists(ref) 

1681 

1682 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1683 """Remove one or more `~CollectionType.RUN` collections and the 

1684 datasets within them. 

1685 

1686 Parameters 

1687 ---------- 

1688 names : `Iterable` [ `str` ] 

1689 The names of the collections to remove. 

1690 unstore : `bool`, optional 

1691 If `True` (default), delete datasets from all datastores in which 

1692 they are present, and attempt to roll back the registry deletions if 

1693 datastore deletions fail (which may not always be possible). If 

1694 `False`, datastore records for these datasets are still removed, 

1695 but any artifacts (e.g. files) will not be. 

1696 

1697 Raises 

1698 ------ 

1699 TypeError 

1700 Raised if one or more collections are not of type 

1701 `~CollectionType.RUN`. 
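
Examples
--------
Remove two output runs and delete their artifacts (run names are
illustrative)::

    butler.removeRuns(["u/someone/run1", "u/someone/run2"], unstore=True)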

1702 """ 

1703 if not self.isWriteable(): 

1704 raise TypeError("Butler is read-only.") 

1705 names = list(names) 

1706 refs: List[DatasetRef] = [] 

1707 for name in names: 

1708 collectionType = self.registry.getCollectionType(name) 

1709 if collectionType is not CollectionType.RUN: 

1710 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1711 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1712 with self.datastore.transaction(): 

1713 with self.registry.transaction(): 

1714 if unstore: 

1715 self.datastore.trash(refs) 

1716 else: 

1717 self.datastore.forget(refs) 

1718 for name in names: 

1719 self.registry.removeCollection(name) 

1720 if unstore: 

1721 # Point of no return for removing artifacts 

1722 self.datastore.emptyTrash() 

1723 

1724 def pruneDatasets( 

1725 self, 

1726 refs: Iterable[DatasetRef], 

1727 *, 

1728 disassociate: bool = True, 

1729 unstore: bool = False, 

1730 tags: Iterable[str] = (), 

1731 purge: bool = False, 

1732 ) -> None: 

1733 # docstring inherited from LimitedButler 

1734 

1735 if not self.isWriteable(): 

1736 raise TypeError("Butler is read-only.") 

1737 if purge: 

1738 if not disassociate: 

1739 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1740 if not unstore: 

1741 raise TypeError("Cannot pass purge=True without unstore=True.") 

1742 elif disassociate: 

1743 tags = tuple(tags) 

1744 if not tags: 

1745 raise TypeError("No tags provided but disassociate=True.") 

1746 for tag in tags: 

1747 collectionType = self.registry.getCollectionType(tag) 

1748 if collectionType is not CollectionType.TAGGED: 

1749 raise TypeError( 

1750 f"Cannot disassociate from collection '{tag}' " 

1751 f"of non-TAGGED type {collectionType.name}." 

1752 ) 

1753 # For an execution butler we want to keep existing UUIDs for the 

1754 # datasets, for that we need to keep them in the collections but 

1755 # remove from datastore. 

1756 if self._allow_put_of_predefined_dataset and purge: 

1757 purge = False 

1758 disassociate = False 

1759 # Transform possibly-single-pass iterable into something we can iterate 

1760 # over multiple times. 

1761 refs = list(refs) 

1762 # Pruning a component of a DatasetRef makes no sense since registry 

1763 # doesn't know about components and datastore might not store 

1764 # components in a separate file 

1765 for ref in refs: 

1766 if ref.datasetType.component(): 

1767 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1768 # We don't need an unreliable Datastore transaction for this, because 

1769 # we've been extra careful to ensure that Datastore.trash only involves 

1770 # mutating the Registry (it can _look_ at Datastore-specific things, 

1771 # but shouldn't change them), and hence all operations here are 

1772 # Registry operations. 

1773 with self.datastore.transaction(): 

1774 with self.registry.transaction(): 

1775 if unstore: 

1776 self.datastore.trash(refs) 

1777 if purge: 

1778 self.registry.removeDatasets(refs) 

1779 elif disassociate: 

1780 assert tags, "Guaranteed by earlier logic in this function." 

1781 for tag in tags: 

1782 self.registry.disassociate(tag, refs) 

1783 # We've exited the Registry transaction, and apparently committed. 

1784 # (if there was an exception, everything rolled back, and it's as if 

1785 # nothing happened - and we never get here). 

1786 # Datastore artifacts are not yet gone, but they're clearly marked 

1787 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1788 # problems we can try again later, and if manual administrative 

1789 # intervention is required, it's pretty clear what that should entail: 

1790 # deleting everything on disk and in private Datastore tables that is 

1791 # in the dataset_location_trash table. 

1792 if unstore: 

1793 # Point of no return for removing artifacts 

1794 self.datastore.emptyTrash() 

1795 

1796 @transactional 

1797 def ingest( 

1798 self, 

1799 *datasets: FileDataset, 

1800 transfer: Optional[str] = "auto", 

1801 run: Optional[str] = None, 

1802 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1803 record_validation_info: bool = True, 

1804 ) -> None: 

1805 """Store and register one or more datasets that already exist on disk. 

1806 

1807 Parameters 

1808 ---------- 

1809 datasets : `FileDataset` 

1810 Each positional argument is a struct containing information about 

1811 a file to be ingested, including its URI (either absolute or 

1812 relative to the datastore root, if applicable), a resolved 

1813 `DatasetRef`, and optionally a formatter class or its 

1814 fully-qualified string name. If a formatter is not provided, the 

1815 formatter that would be used for `put` is assumed. On success, 

1816 all `FileDataset.formatter` attributes will be set to the formatter 

1817 class used. `FileDataset.path` attributes may be modified to put 

1818 paths in whatever the datastore considers a standardized form. 

1819 transfer : `str`, optional 

1820 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1821 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1822 transfer the file. 

1823 run : `str`, optional 

1824 The name of the run ingested datasets should be added to, 

1825 overriding ``self.run``. This parameter is now deprecated since 

1826 the run is encoded in the ``FileDataset``. 

1827 idGenerationMode : `DatasetIdGenEnum`, optional 

1828 Specifies option for generating dataset IDs. By default unique IDs 

1829 are generated for each inserted dataset. 

1830 record_validation_info : `bool`, optional 

1831 If `True`, the default, the datastore can record validation 

1832 information associated with the file. If `False` the datastore 

1833 will not attempt to track any information such as checksums 

1834 or file sizes. This can be useful if such information is tracked 

1835 in an external system or if the file is to be compressed in place. 

1836 It is up to the datastore whether this parameter is relevant. 

1837 

1838 Raises 

1839 ------ 

1840 TypeError 

1841 Raised if the butler is read-only or if no run was provided. 

1842 NotImplementedError 

1843 Raised if the `Datastore` does not support the given transfer mode. 

1844 DatasetTypeNotSupportedError 

1845 Raised if one or more files to be ingested have a dataset type that 

1846 is not supported by the `Datastore`. 

1847 FileNotFoundError 

1848 Raised if one of the given files does not exist. 

1849 FileExistsError 

1850 Raised if transfer is not `None` but the (internal) location the 

1851 file would be moved to is already occupied. 

1852 

1853 Notes 

1854 ----- 

1855 This operation is not fully exception safe: if a database operation 

1856 fails, the given `FileDataset` instances may be only partially updated. 

1857 

1858 It is atomic in terms of database operations (they will either all 

1859 succeed or all fail), provided the database engine implements 

1860 transactions correctly. It will attempt to be atomic in terms of 

1861 filesystem operations as well, but this cannot be implemented 

1862 rigorously for most datastores. 
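
Examples
--------
A minimal sketch that ingests a single file, assuming a butler
constructed with a default output run and relying on the transitional
support for resolving unresolved refs into that run; the dataset type,
data ID, and path are illustrative, and the `FileDataset` keyword
arguments are inferred from its documented attributes::

    from lsst.daf.butler import DatasetRef, FileDataset

    raw_type = butler.registry.getDatasetType("raw")
    ref = DatasetRef(
        raw_type, {"instrument": "MyCam", "exposure": 1, "detector": 1}
    )
    butler.ingest(
        FileDataset(path="/data/raw_1_1.fits", refs=[ref]), transfer="copy"
    )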

1863 """ 

1864 if not self.isWriteable(): 

1865 raise TypeError("Butler is read-only.") 

1866 

1867 log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1868 if not datasets: 

1869 return 

1870 

1871 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1872 

1873 # We need to reorganize all the inputs so that they are grouped 

1874 # by dataset type and run. Multiple refs in a single FileDataset 

1875 # are required to share the run and dataset type. 

1876 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1877 groupedData: GroupedData = defaultdict(list) 

1878 

1879 # Track DataIDs that are being ingested so we can spot issues early 

1880 # with duplication. Retain previous FileDataset so we can report it. 

1881 groupedDataIds: MutableMapping[ 

1882 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1883 ] = defaultdict(dict) 

1884 

1885 logged_resolving = False 

1886 used_run = False 

1887 default_run = run or self.run 

1888 

1889 # And the nested loop that populates it: 

1890 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1891 # Somewhere to store pre-existing refs if we have an 

1892 # execution butler. 

1893 existingRefs: List[DatasetRef] = [] 

1894 

1895 # Any newly-resolved refs. 

1896 resolvedRefs: list[DatasetRef] = [] 

1897 

1898 dataset_run: str | None = None 

1899 for ref in dataset.refs: 

1900 if ref.id is None: 

1901 # Eventually this will be impossible. For now we must 

1902 # resolve this ref. 

1903 if default_run is None: 

1904 raise ValueError("Unresolved DatasetRef used for ingest but no run specified.") 

1905 expanded_dataId = self.registry.expandDataId(ref.dataId) 

1906 if not logged_resolving: 

1907 log.info("ingest() given unresolved refs. Resolving them into run %r", default_run) 

1908 logged_resolving = True 

1909 resolved = DatasetIdFactory().resolveRef(ref, default_run, idGenerationMode) 

1910 ref = resolved.expanded(expanded_dataId) 

1911 resolvedRefs.append(ref) 

1912 used_run = True 

1913 

1914 if dataset_run is None: 

1915 dataset_run = ref.run 

1916 elif dataset_run != ref.run: 

1917 raise ConflictingDefinitionError( 

1918 f"Refs in {dataset} have different runs and we currently require one run per file." 

1919 ) 

1920 

1921 assert ref.run is not None # For mypy 

1922 group_key = (ref.datasetType, ref.run) 

1923 

1924 if ref.dataId in groupedDataIds[group_key]: 

1925 raise ConflictingDefinitionError( 

1926 f"Ingest conflict. Dataset {dataset.path} has same" 

1927 " DataId as other ingest dataset" 

1928 f" {groupedDataIds[group_key][ref.dataId].path} " 

1929 f" ({ref.dataId})" 

1930 ) 

1931 if self._allow_put_of_predefined_dataset: 

1932 existing_ref = self.registry.findDataset( 

1933 ref.datasetType, dataId=ref.dataId, collections=ref.run 

1934 ) 

1935 if existing_ref: 

1936 if existing_ref.id != ref.id: 

1937 raise ConflictingDefinitionError( 

1938 f"Registry has registered dataset {existing_ref!r} which has differing ID " 

1939 f"from that being ingested ({ref!r})." 

1940 ) 

1941 if self.datastore.knows(existing_ref): 

1942 raise ConflictingDefinitionError( 

1943 f"Dataset associated with path {dataset.path}" 

1944 f" already exists as {existing_ref}." 

1945 ) 

1946 # Datastore will need expanded data coordinate 

1947 # so this has to be attached to the FileDataset 

1948 # if necessary. 

1949 if not ref.dataId.hasRecords(): 

1950 expanded_dataId = self.registry.expandDataId(ref.dataId) 

1951 existing_ref = existing_ref.expanded(expanded_dataId) 

1952 else: 

1953 # Both refs are identical but we want to 

1954 # keep the expanded one. 

1955 existing_ref = ref 

1956 

1957 # Store this ref elsewhere since it already exists 

1958 # and we do not want to remake it but we do want 

1959 # to store it in the datastore. 

1960 existingRefs.append(existing_ref) 

1961 

1962 # Nothing else to do until we have finished 

1963 # iterating. 

1964 continue 

1965 

1966 groupedDataIds[group_key][ref.dataId] = dataset 

1967 

1968 if existingRefs: 

1969 if len(dataset.refs) != len(existingRefs): 

1970 # Keeping track of partially pre-existing datasets is hard 

1971 # and should generally never happen. For now don't allow 

1972 # it. 

1973 raise ConflictingDefinitionError( 

1974 f"For dataset {dataset.path} some dataIds already exist" 

1975 " in registry but others do not. This is not supported." 

1976 ) 

1977 

1978 # Store expanded form in the original FileDataset. 

1979 dataset.refs = existingRefs 

1980 elif resolvedRefs: 

1981 if len(dataset.refs) != len(resolvedRefs): 

1982 raise ConflictingDefinitionError( 

1983 f"For dataset {dataset.path} some DatasetRef were " 

1984 "resolved and others were not. This is not supported." 

1985 ) 

1986 dataset.refs = resolvedRefs 

1987 

1988 # These datasets have to be registered. 

1989 self.registry._importDatasets(resolvedRefs) 

1990 else: 

1991 groupedData[group_key].append(dataset) 

1992 

1993 if not used_run and run is not None: 

1994 warnings.warn( 

1995 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

1996 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

1997 category=FutureWarning, 

1998 stacklevel=3, # Take into account the @transactional decorator. 

1999 ) 

2000 

2001 # Now we can bulk-insert into Registry for each DatasetType. 

2002 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

2003 groupedData.items(), desc="Bulk-inserting datasets by type" 

2004 ): 

2005 refs_to_import = [] 

2006 for dataset in grouped_datasets: 

2007 refs_to_import.extend(dataset.refs) 

2008 

2009 n_refs = len(refs_to_import) 

2010 log.verbose( 

2011 "Importing %d ref%s of dataset type %r into run %r", 

2012 n_refs, 

2013 "" if n_refs == 1 else "s", 

2014 datasetType.name, 

2015 this_run, 

2016 ) 

2017 

2018 # Import the refs and expand the DataCoordinates since we can't 

2019 # guarantee that they are expanded and Datastore will need 

2020 # the records. 

2021 imported_refs = self.registry._importDatasets(refs_to_import, expand=True) 

2022 assert set(imported_refs) == set(refs_to_import) 

2023 

2024 # Replace all the refs in the FileDataset with expanded versions. 

2025 for dataset in grouped_datasets: 

2026 new_refs = [imported_refs.pop(0) for _ in dataset.refs] 

2027 dataset.refs = new_refs 

2028 

2029 # Bulk-insert everything into Datastore. 

2030 # We do not know if any of the registry entries already existed 

2031 # (_importDatasets only complains if they exist but differ) so 

2032 # we have to catch IntegrityError explicitly. 

2033 try: 

2034 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

2035 except IntegrityError as e: 

2036 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

2037 

2038 @contextlib.contextmanager 

2039 def export( 

2040 self, 

2041 *, 

2042 directory: Optional[str] = None, 

2043 filename: Optional[str] = None, 

2044 format: Optional[str] = None, 

2045 transfer: Optional[str] = None, 

2046 ) -> Iterator[RepoExportContext]: 

2047 """Export datasets from the repository represented by this `Butler`. 

2048 

2049 This method is a context manager that returns a helper object 

2050 (`RepoExportContext`) that is used to indicate what information from 

2051 the repository should be exported. 

2052 

2053 Parameters 

2054 ---------- 

2055 directory : `str`, optional 

2056 Directory dataset files should be written to if ``transfer`` is not 

2057 `None`. 

2058 filename : `str`, optional 

2059 Name for the file that will include database information associated 

2060 with the exported datasets. If this is not an absolute path and 

2061 ``directory`` is not `None`, it will be written to ``directory`` 

2062 instead of the current working directory. Defaults to 

2063 "export.{format}". 

2064 format : `str`, optional 

2065 File format for the database information file. If `None`, the 

2066 extension of ``filename`` will be used. 

2067 transfer : `str`, optional 

2068 Transfer mode passed to `Datastore.export`. 

2069 

2070 Raises 

2071 ------ 

2072 TypeError 

2073 Raised if the set of arguments passed is inconsistent. 

2074 

2075 Examples 

2076 -------- 

2077 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

2078 methods are used to provide the iterables over data IDs and/or datasets 

2079 to be exported:: 

2080 

2081 with butler.export(filename="exports.yaml") as export: 

2082 # Export all flats, but none of the dimension element rows 

2083 # (i.e. data ID information) associated with them. 

2084 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2085 elements=()) 

2086 # Export all datasets that start with "deepCoadd_" and all of 

2087 # their associated data ID information. 

2088 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2089 """ 

2090 if directory is None and transfer is not None: 

2091 raise TypeError("Cannot transfer without providing a directory.") 

2092 if transfer == "move": 

2093 raise TypeError("Transfer may not be 'move': export is read-only") 

2094 if format is None: 

2095 if filename is None: 

2096 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2097 else: 

2098 _, format = os.path.splitext(filename) 

2099 if not format: 

2100 raise ValueError("Please specify a file extension to determine export format.") 

2101 format = format[1:]  # Strip leading "." 

2102 elif filename is None: 

2103 filename = f"export.{format}" 

2104 if directory is not None: 

2105 filename = os.path.join(directory, filename) 

2106 formats = self._config["repo_transfer_formats"] 

2107 if format not in formats: 

2108 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2109 BackendClass = get_class_of(formats[format, "export"]) 

2110 with open(filename, "w") as stream: 

2111 backend = BackendClass(stream, universe=self.registry.dimensions) 

2112 try: 

2113 helper = RepoExportContext( 

2114 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2115 ) 

2116 yield helper 

2117 except BaseException: 

2118 raise 

2119 else: 

2120 helper._finish() 

2121 

2122 def import_( 

2123 self, 

2124 *, 

2125 directory: Optional[ResourcePathExpression] = None, 

2126 filename: Union[ResourcePathExpression, TextIO, None] = None, 

2127 format: Optional[str] = None, 

2128 transfer: Optional[str] = None, 

2129 skip_dimensions: Optional[Set] = None, 

2130 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2131 reuseIds: bool = False, 

2132 ) -> None: 

2133 """Import datasets into this repository that were exported from a 

2134 different butler repository via `~lsst.daf.butler.Butler.export`. 

2135 

2136 Parameters 

2137 ---------- 

2138 directory : `~lsst.resources.ResourcePathExpression`, optional 

2139 Directory containing dataset files to import from. If `None`, 

2140 ``filename`` and all dataset file paths specified therein must 

2141 be absolute. 

2142 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2143 A stream or name of file that contains database information 

2144 associated with the exported datasets, typically generated by 

2145 `~lsst.daf.butler.Butler.export`. If this is a string (name) or 

2146 `~lsst.resources.ResourcePath` and is not an absolute path, 

2147 it will first be looked for relative to ``directory`` and if not 

2148 found there it will be looked for in the current working 

2149 directory. Defaults to "export.{format}". 

2150 format : `str`, optional 

2151 File format for ``filename``. If `None`, the extension of 

2152 ``filename`` will be used. 

2153 transfer : `str`, optional 

2154 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2155 skip_dimensions : `set`, optional 

2156 Names of dimensions that should be skipped and not imported. 

2157 idGenerationMode : `DatasetIdGenEnum`, optional 

2158 Specifies option for generating dataset IDs when IDs are not 

2159 provided or their type does not match backend type. By default 

2160 unique IDs are generated for each inserted dataset. 

2161 reuseIds : `bool`, optional 

2162 If `True` then forces re-use of imported dataset IDs for integer 

2163 IDs which are normally generated as auto-incremented; exception 

2164 will be raised if imported IDs clash with existing ones. This 

2165 option has no effect on the use of globally-unique IDs which are 

2166 always re-used (or generated if integer IDs are being imported). 

2167 

2168 Raises 

2169 ------ 

2170 TypeError 

2171 Raised if the set of arguments passed is inconsistent, or if the 

2172 butler is read-only. 
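
Examples
--------
Import a previously exported repository subset, copying its file
artifacts (the paths are illustrative)::

    butler.import_(
        directory="/path/to/exported/files",
        filename="export.yaml",
        transfer="copy",
    )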

2173 """ 

2174 if not self.isWriteable(): 

2175 raise TypeError("Butler is read-only.") 

2176 if format is None: 

2177 if filename is None: 

2178 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2179 else: 

2180 _, format = os.path.splitext(filename) # type: ignore 

2181 elif filename is None: 

2182 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2183 if directory is not None: 

2184 directory = ResourcePath(directory, forceDirectory=True) 

2185 # mypy doesn't think this will work but it does in python >= 3.10. 

2186 if isinstance(filename, ResourcePathExpression): # type: ignore 

2187 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2188 if not filename.isabs() and directory is not None: 

2189 potential = directory.join(filename) 

2190 exists_in_cwd = filename.exists() 

2191 exists_in_dir = potential.exists() 

2192 if exists_in_cwd and exists_in_dir: 

2193 log.warning( 

2194 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2195 "Additionally, the file exists relative to the given search directory (%s). " 

2196 "Using the export file in the given directory.", 

2197 filename, 

2198 potential, 

2199 ) 

2200 # Given they specified an explicit directory and that 

2201 # directory has the export file in it, assume that that 

2202 # is what was meant despite the file in cwd. 

2203 filename = potential 

2204 elif exists_in_dir: 

2205 filename = potential 

2206 elif not exists_in_cwd and not exists_in_dir: 

2207 # Raise early. 

2208 raise FileNotFoundError( 

2209 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2210 ) 

2211 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2212 

2213 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2214 backend = BackendClass(importStream, self.registry) 

2215 backend.register() 

2216 with self.transaction(): 

2217 backend.load( 

2218 self.datastore, 

2219 directory=directory, 

2220 transfer=transfer, 

2221 skip_dimensions=skip_dimensions, 

2222 idGenerationMode=idGenerationMode, 

2223 reuseIds=reuseIds, 

2224 ) 

2225 

2226 if isinstance(filename, ResourcePath): 

2227 # We can not use open() here at the moment because of 

2228 # DM-38589 since yaml does stream.read(8192) in a loop. 

2229 stream = io.StringIO(filename.read().decode()) 

2230 doImport(stream) 

2231 else: 

2232 doImport(filename) # type: ignore 

2233 

2234 def transfer_from( 

2235 self, 

2236 source_butler: LimitedButler, 

2237 source_refs: Iterable[DatasetRef], 

2238 transfer: str = "auto", 

2239 skip_missing: bool = True, 

2240 register_dataset_types: bool = False, 

2241 transfer_dimensions: bool = False, 

2242 ) -> collections.abc.Collection[DatasetRef]: 

2243 """Transfer datasets to this Butler from a run in another Butler. 

2244 

2245 Parameters 

2246 ---------- 

2247 source_butler : `LimitedButler` 

2248 Butler from which the datasets are to be transferred. If data IDs 

2249 in ``source_refs`` are not expanded then this has to be a full 

2250 `Butler` whose registry will be used to expand data IDs. 

2251 source_refs : iterable of `DatasetRef` 

2252 Datasets defined in the source butler that should be transferred to 

2253 this butler. 

2254 transfer : `str`, optional 

2255 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2256 skip_missing : `bool` 

2257 If `True`, datasets with no datastore artifact associated with 

2258 them are not transferred. If `False` a registry entry will be 

2259 created even if no datastore record is created (and so will 

2260 look equivalent to the dataset being unstored). 

2261 register_dataset_types : `bool` 

2262 If `True` any missing dataset types are registered. Otherwise 

2263 an exception is raised. 

2264 transfer_dimensions : `bool`, optional 

2265 If `True`, dimension record data associated with the new datasets 

2266 will be transferred. 

2267 

2268 Returns 

2269 ------- 

2270 refs : `list` of `DatasetRef` 

2271 The refs added to this Butler. 

2272 

2273 Notes 

2274 ----- 

2275 The datastore artifact has to exist for a transfer 

2276 to be made but non-existence is not an error. 

2277 

2278 Datasets that already exist in this run will be skipped. 

2279 

2280 The datasets are imported as part of a transaction, although 

2281 dataset types are registered before the transaction is started. 

2282 This means that it is possible for a dataset type to be registered 

2283 even though transfer has failed. 
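
Examples
--------
Transfer the results of a registry query from one repository to another;
the repository paths, dataset type, and collection are illustrative::

    from lsst.daf.butler import Butler

    source = Butler("/repo/source")
    target = Butler("/repo/target", writeable=True)
    refs = source.registry.queryDatasets("calexp", collections="runs/example")
    transferred = target.transfer_from(
        source, refs, transfer="copy", register_dataset_types=True
    )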

2284 """ 

2285 if not self.isWriteable(): 

2286 raise TypeError("Butler is read-only.") 

2287 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2288 

2289 # Will iterate through the refs multiple times so need to convert 

2290 # to a list if this isn't a collection. 

2291 if not isinstance(source_refs, collections.abc.Collection): 

2292 source_refs = list(source_refs) 

2293 

2294 original_count = len(source_refs) 

2295 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2296 

2297 # In some situations the datastore artifact may be missing 

2298 # and we do not want that registry entry to be imported. 

2299 # Asking datastore is not sufficient, the records may have been 

2300 # purged, we have to ask for the (predicted) URI and check 

2301 # existence explicitly. Execution butler is set up exactly like 

2302 # this with no datastore records. 

2303 artifact_existence: Dict[ResourcePath, bool] = {} 

2304 if skip_missing: 

2305 dataset_existence = source_butler.datastore.mexists( 

2306 source_refs, artifact_existence=artifact_existence 

2307 ) 

2308 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2309 filtered_count = len(source_refs) 

2310 n_missing = original_count - filtered_count 

2311 log.verbose( 

2312 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2313 n_missing, 

2314 "" if n_missing == 1 else "s", 

2315 filtered_count, 

2316 ) 

2317 

2318 # Importing requires that we group the refs by dataset type and run 

2319 # before doing the import. 

2320 source_dataset_types = set() 

2321 grouped_refs = defaultdict(list) 

2322 for ref in source_refs: 

2323 grouped_refs[ref.datasetType, ref.run].append(ref) 

2324 source_dataset_types.add(ref.datasetType) 

2325 

2326 # Check to see if the dataset type in the source butler has 

2327 # the same definition in the target butler and register missing 

2328 # ones if requested. Registration must happen outside a transaction. 

2329 newly_registered_dataset_types = set() 

2330 for datasetType in source_dataset_types: 

2331 if register_dataset_types: 

2332 # Let this raise immediately if inconsistent. Continuing 

2333 # on to find additional inconsistent dataset types 

2334 # might result in additional unwanted dataset types being 

2335 # registered. 

2336 if self.registry.registerDatasetType(datasetType): 

2337 newly_registered_dataset_types.add(datasetType) 

2338 else: 

2339 # If the dataset type is missing, let it fail immediately. 

2340 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2341 if target_dataset_type != datasetType: 

2342 raise ConflictingDefinitionError( 

2343 "Source butler dataset type differs from definition" 

2344 f" in target butler: {datasetType} !=" 

2345 f" {target_dataset_type}" 

2346 ) 

2347 if newly_registered_dataset_types: 

2348 # We may have registered some even if there were inconsistencies 

2349 # but should let people know (or else remove them again). 

2350 log.log( 

2351 VERBOSE, 

2352 "Registered the following dataset types in the target Butler: %s", 

2353 ", ".join(d.name for d in newly_registered_dataset_types), 

2354 ) 

2355 else: 

2356 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2357 

2358 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2359 if transfer_dimensions: 

2360 # Collect all the dimension records for these refs. 

2361 # All dimensions are to be copied but the list of valid dimensions 

2362 # comes from this butler's universe. 

2363 elements = frozenset( 

2364 element 

2365 for element in self.registry.dimensions.getStaticElements() 

2366 if element.hasTable() and element.viewOf is None 

2367 ) 

2368 dataIds = set(ref.dataId for ref in source_refs) 

2369 # This logic comes from saveDataIds. 

2370 for dataId in dataIds: 

2371 # Need an expanded record; if it is not expanded we need a full 

2372 # butler with a registry (allow mocks with registry too). 

2373 if not dataId.hasRecords(): 

2374 if registry := getattr(source_butler, "registry", None): 

2375 dataId = registry.expandDataId(dataId) 

2376 else: 

2377 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2378 # If this butler doesn't know about a dimension in the source 

2379 # butler things will break later. 

2380 for record in dataId.records.values(): 

2381 if record is not None and record.definition in elements: 

2382 dimension_records[record.definition].setdefault(record.dataId, record) 

2383 

2384 handled_collections: Set[str] = set() 

2385 

2386 # Do all the importing in a single transaction. 

2387 with self.transaction(): 

2388 if dimension_records: 

2389 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2390 for element, r in dimension_records.items(): 

2391 records = [r[dataId] for dataId in r] 

2392 # Assume that if the record is already present that we can 

2393 # use it without having to check that the record metadata 

2394 # is consistent. 

2395 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2396 

2397 n_imported = 0 

2398 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2399 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2400 ): 

2401 if run not in handled_collections: 

2402 # May need to create output collection. If source butler 

2403 # has a registry, ask for documentation string. 

2404 run_doc = None 

2405 if registry := getattr(source_butler, "registry", None): 

2406 run_doc = registry.getCollectionDocumentation(run) 

2407 registered = self.registry.registerRun(run, doc=run_doc) 

2408 handled_collections.add(run) 

2409 if registered: 

2410 log.log(VERBOSE, "Creating output run %s", run) 

2411 

2412 n_refs = len(refs_to_import) 

2413 log.verbose( 

2414 "Importing %d ref%s of dataset type %s into run %s", 

2415 n_refs, 

2416 "" if n_refs == 1 else "s", 

2417 datasetType.name, 

2418 run, 

2419 ) 

2420 

2421 # Assume we are using UUIDs and the source refs will match 

2422 # those imported. 

2423 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2424 assert set(imported_refs) == set(refs_to_import) 

2425 n_imported += len(imported_refs) 

2426 

2427 assert len(source_refs) == n_imported 

2428 log.verbose("Imported %d datasets into destination butler", n_imported) 

2429 

2430 # Ask the datastore to transfer. The datastore has to check that 

2431 # the source datastore is compatible with the target datastore. 

2432 accepted, rejected = self.datastore.transfer_from( 

2433 source_butler.datastore, 

2434 source_refs, 

2435 transfer=transfer, 

2436 artifact_existence=artifact_existence, 

2437 ) 

2438 if rejected: 

2439 # For now, accept the registry entries but not the files. 

2440 log.warning( 

2441 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2442 len(rejected), 

2443 len(accepted), 

2444 datasetType, 

2445 run, 

2446 ) 

2447 

2448 return source_refs 

2449 

2450 def validateConfiguration( 

2451 self, 

2452 logFailures: bool = False, 

2453 datasetTypeNames: Optional[Iterable[str]] = None, 

2454 ignore: Iterable[str] | None = None, 

2455 ) -> None: 

2456 """Validate butler configuration. 

2457 

2458 Checks that each `DatasetType` can be stored in the `Datastore`. 

2459 

2460 Parameters 

2461 ---------- 

2462 logFailures : `bool`, optional 

2463 If `True`, output a log message for every validation error 

2464 detected. 

2465 datasetTypeNames : iterable of `str`, optional 

2466 The `DatasetType` names that should be checked. This allows 

2467 only a subset to be selected. 

2468 ignore : iterable of `str`, optional 

2469 Names of DatasetTypes to skip over. This can be used to skip 

2470 known problems. If a named `DatasetType` corresponds to a 

2471 composite, all components of that `DatasetType` will also be 

2472 ignored. 

2473 

2474 Raises 

2475 ------ 

2476 ButlerValidationError 

2477 Raised if there is some inconsistency with how this Butler 

2478 is configured. 
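
Examples
--------
Validate a subset of dataset types and log any failures (the names are
illustrative)::

    butler.validateConfiguration(
        logFailures=True,
        datasetTypeNames=["raw", "calexp"],
        ignore=["packages"],
    )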

2479 """ 

2480 if datasetTypeNames: 

2481 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2482 else: 

2483 datasetTypes = list(self.registry.queryDatasetTypes()) 

2484 

2485 # filter out anything from the ignore list 

2486 if ignore: 

2487 ignore = set(ignore) 

2488 datasetTypes = [ 

2489 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2490 ] 

2491 else: 

2492 ignore = set() 

2493 

2494 # Find all the registered instruments 

2495 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2496 

2497 # For each datasetType that has an instrument dimension, create 

2498 # a DatasetRef for each defined instrument 

2499 datasetRefs = [] 

2500 

2501 for datasetType in datasetTypes: 

2502 if "instrument" in datasetType.dimensions: 

2503 for instrument in instruments: 

2504 datasetRef = DatasetRef( 

2505 datasetType, 

2506 {"instrument": instrument}, # type: ignore 

2507 conform=False, 

2508 run="validate", 

2509 ) 

2510 datasetRefs.append(datasetRef) 

2511 

2512 entities: List[Union[DatasetType, DatasetRef]] = [] 

2513 entities.extend(datasetTypes) 

2514 entities.extend(datasetRefs) 

2515 

2516 datastoreErrorStr = None 

2517 try: 

2518 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2519 except ValidationError as e: 

2520 datastoreErrorStr = str(e) 

2521 

2522 # Also check that the LookupKeys used by the datastores match 

2523 # registry and storage class definitions 

2524 keys = self.datastore.getLookupKeys() 

2525 

2526 failedNames = set() 

2527 failedDataId = set() 

2528 for key in keys: 

2529 if key.name is not None: 

2530 if key.name in ignore: 

2531 continue 

2532 

2533 # skip if specific datasetType names were requested and this 

2534 # name does not match 

2535 if datasetTypeNames and key.name not in datasetTypeNames: 

2536 continue 

2537 

2538 # See if it is a StorageClass or a DatasetType 

2539 if key.name in self.storageClasses: 

2540 pass 

2541 else: 

2542 try: 

2543 self.registry.getDatasetType(key.name) 

2544 except KeyError: 

2545 if logFailures: 

2546 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2547 failedNames.add(key) 

2548 else: 

2549 # Dimensions are checked for consistency when the Butler 

2550 # is created and rendezvoused with a universe. 

2551 pass 

2552 

2553 # Check that the instrument is a valid instrument 

2554 # Currently only support instrument so check for that 

2555 if key.dataId: 

2556 dataIdKeys = set(key.dataId) 

2557 if set(["instrument"]) != dataIdKeys: 

2558 if logFailures: 

2559 log.critical("Key '%s' has unsupported DataId override", key) 

2560 failedDataId.add(key) 

2561 elif key.dataId["instrument"] not in instruments: 

2562 if logFailures: 

2563 log.critical("Key '%s' has unknown instrument", key) 

2564 failedDataId.add(key) 

2565 

2566 messages = [] 

2567 

2568 if datastoreErrorStr: 

2569 messages.append(datastoreErrorStr) 

2570 

2571 for failed, msg in ( 

2572 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2573 (failedDataId, "Keys with bad DataId entries: "), 

2574 ): 

2575 if failed: 

2576 msg += ", ".join(str(k) for k in failed) 

2577 messages.append(msg) 

2578 

2579 if messages: 

2580 raise ValidationError(";\n".join(messages)) 

2581 

2582 @property 

2583 def collections(self) -> Sequence[str]: 

2584 """The collections to search by default, in order 

2585 (`Sequence` [ `str` ]). 

2586 

2587 This is an alias for ``self.registry.defaults.collections``. It cannot 

2588 be set directly in isolation, but all defaults may be changed together 

2589 by assigning a new `RegistryDefaults` instance to 

2590 ``self.registry.defaults``. 
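
For example (the collection and run names are illustrative)::

    butler.registry.defaults = RegistryDefaults(
        collections=["runs/example"], run="runs/example"
    )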

2591 """ 

2592 return self.registry.defaults.collections 

2593 

2594 @property 

2595 def run(self) -> Optional[str]: 

2596 """Name of the run this butler writes outputs to by default (`str` or 

2597 `None`). 

2598 

2599 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2600 directly in isolation, but all defaults may be changed together by 

2601 assigning a new `RegistryDefaults` instance to 

2602 ``self.registry.defaults``. 

2603 """ 

2604 return self.registry.defaults.run 

2605 

2606 @property 

2607 def dimensions(self) -> DimensionUniverse: 

2608 # Docstring inherited. 

2609 return self.registry.dimensions 

2610 

2611 registry: Registry 

2612 """The object that manages dataset metadata and relationships (`Registry`). 

2613 

2614 Most operations that don't involve reading or writing butler datasets are 

2615 accessible only via `Registry` methods. 

2616 """ 

2617 

2618 datastore: Datastore 

2619 """The object that manages actual dataset storage (`Datastore`). 

2620 

2621 Direct user access to the datastore should rarely be necessary; the primary 

2622 exception is the case where a `Datastore` implementation provides extra 

2623 functionality beyond what the base class defines. 

2624 """ 

2625 

2626 storageClasses: StorageClassFactory 

2627 """An object that maps known storage class names to objects that fully 

2628 describe them (`StorageClassFactory`). 

2629 """ 

2630 

2631 _allow_put_of_predefined_dataset: bool 

2632 """Allow a put to succeed even if there is already a registry entry for it 

2633 but not a datastore record. (`bool`)."""