Coverage for python/lsst/daf/butler/_butler.py: 8%

691 statements  


1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22""" 

23Butler top level classes. 

24""" 

25from __future__ import annotations 

26 

27__all__ = ( 

28 "Butler", 

29 "ButlerValidationError", 

30) 

31 

32import collections.abc 

33import contextlib 

34import io 

35import logging 

36import numbers 

37import os 

38import uuid 

39import warnings 

40from collections import defaultdict 

41from typing import ( 

42 TYPE_CHECKING, 

43 Any, 

44 ClassVar, 

45 Counter, 

46 Dict, 

47 Iterable, 

48 Iterator, 

49 List, 

50 MutableMapping, 

51 Optional, 

52 Sequence, 

53 Set, 

54 TextIO, 

55 Tuple, 

56 Type, 

57 Union, 

58) 

59 

60from deprecated.sphinx import deprecated 

61from lsst.resources import ResourcePath, ResourcePathExpression 

62from lsst.utils import doImportType 

63from lsst.utils.introspection import get_class_of 

64from lsst.utils.logging import VERBOSE, getLogger 

65 

66from ._butlerConfig import ButlerConfig 

67from ._butlerRepoIndex import ButlerRepoIndex 

68from ._deferredDatasetHandle import DeferredDatasetHandle 

69from ._limited_butler import LimitedButler 

70from .core import ( 

71 AmbiguousDatasetError, 

72 Config, 

73 ConfigSubset, 

74 DataCoordinate, 

75 DataId, 

76 DataIdValue, 

77 DatasetIdGenEnum, 

78 DatasetRef, 

79 DatasetRefURIs, 

80 DatasetType, 

81 Datastore, 

82 Dimension, 

83 DimensionConfig, 

84 DimensionElement, 

85 DimensionRecord, 

86 DimensionUniverse, 

87 FileDataset, 

88 Progress, 

89 StorageClass, 

90 StorageClassFactory, 

91 Timespan, 

92 UnresolvedRefWarning, 

93 ValidationError, 

94) 

95from .core.repoRelocation import BUTLER_ROOT_TAG 

96from .core.utils import transactional 

97from .registry import ( 

98 CollectionType, 

99 ConflictingDefinitionError, 

100 DataIdError, 

101 MissingDatasetTypeError, 

102 Registry, 

103 RegistryConfig, 

104 RegistryDefaults, 

105) 

106from .transfers import RepoExportContext 

107 

108if TYPE_CHECKING: 

109 from lsst.resources import ResourceHandleProtocol 

110 

111log = getLogger(__name__) 

112 

113 

114class ButlerValidationError(ValidationError): 

115 """There is a problem with the Butler configuration.""" 

116 

117 pass 

118 

119 

120class Butler(LimitedButler): 

121 """Main entry point for the data access system. 

122 

123 Parameters 

124 ---------- 

125 config : `ButlerConfig`, `Config` or `str`, optional

126 Configuration. Anything acceptable to the

127 `ButlerConfig` constructor. If a directory path

128 is given, the configuration will be read from a ``butler.yaml`` file in

129 that location. If `None` is given, default values will be used.

130 butler : `Butler`, optional

131 If provided, construct a new Butler that uses the same registry and 

132 datastore as the given one, but with the given collection and run. 

133 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

134 arguments. 

135 collections : `str` or `Iterable` [ `str` ], optional 

136 An expression specifying the collections to be searched (in order) when 

137 reading datasets. 

138 This may be a `str` collection name or an iterable thereof. 

139 See :ref:`daf_butler_collection_expressions` for more information. 

140 These collections are not registered automatically and must be 

141 manually registered before they are used by any method, though they

142 may be registered after the `Butler` is initialized.

143 run : `str`, optional 

144 Name of the `~CollectionType.RUN` collection new datasets should be 

145 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

146 ``collections`` will be set to ``[run]``. If not `None`, this 

147 collection will automatically be registered. If this is not set (and 

148 ``writeable`` is not set either), a read-only butler will be created. 

149 searchPaths : `list` of `str`, optional 

150 Directory paths to search when calculating the full Butler 

151 configuration. Not used if the supplied config is already a 

152 `ButlerConfig`. 

153 writeable : `bool`, optional 

154 Explicitly sets whether the butler supports write operations. If not 

155 provided, a read-write butler is created only if ``run`` is set

156 (not `None`).

157 inferDefaults : `bool`, optional 

158 If `True` (default) infer default data ID values from the values 

159 present in the datasets in ``collections``: if all collections have the 

160 same value (or no value) for a governor dimension, that value will be 

161 the default for that dimension. Nonexistent collections are ignored. 

162 If a default value is provided explicitly for a governor dimension via 

163 ``**kwargs``, no default will be inferred for that dimension. 

164 **kwargs : `str` 

165 Default data ID key-value pairs. These may only identify "governor" 

166 dimensions like ``instrument`` and ``skymap``. 

167 

168 Examples 

169 -------- 

170 While there are many ways to control exactly how a `Butler` interacts with 

171 the collections in its `Registry`, the most common cases are still simple. 

172 

173 For a read-only `Butler` that searches one collection, do:: 

174 

175 butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) 

176 

177 For a read-write `Butler` that writes to and reads from a 

178 `~CollectionType.RUN` collection:: 

179 

180 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") 

181 

182 The `Butler` passed to a ``PipelineTask`` is often much more complex, 

183 because we want to write to one `~CollectionType.RUN` collection but read 

184 from several others (as well):: 

185 

186 butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", 

187 collections=["u/alice/DM-50000/a", 

188 "u/bob/DM-49998", 

189 "HSC/defaults"]) 

190 

191 This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. 

192 Datasets will be read first from that run (since it appears first in the 

193 chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. 

194 

195 Finally, one can always create a `Butler` with no collections:: 

196 

197 butler = Butler("/path/to/repo", writeable=True) 

198 

199 This can be extremely useful when you just want to use ``butler.registry``, 

200 e.g. for inserting dimension data or managing collections, or when the 

201 collections you want to use with the butler are not consistent. 

202 Passing ``writeable`` explicitly here is only necessary if you want to be 

203 able to make changes to the repo; usually the value for ``writeable`` can

204 be guessed from the collection arguments provided, but it defaults to

205 `False` when there are no collection arguments.

206 """ 

207 

208 def __init__( 

209 self, 

210 config: Union[Config, str, None] = None, 

211 *, 

212 butler: Optional[Butler] = None, 

213 collections: Any = None, 

214 run: Optional[str] = None, 

215 searchPaths: Optional[List[str]] = None, 

216 writeable: Optional[bool] = None, 

217 inferDefaults: bool = True, 

218 **kwargs: str, 

219 ): 

220 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

221 # Load registry, datastore, etc. from config or existing butler. 

222 if butler is not None: 

223 if config is not None or searchPaths is not None or writeable is not None: 

224 raise TypeError( 

225 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

226 ) 

227 self.registry = butler.registry.copy(defaults) 

228 self.datastore = butler.datastore 

229 self.storageClasses = butler.storageClasses 

230 self._config: ButlerConfig = butler._config 

231 self._allow_put_of_predefined_dataset = butler._allow_put_of_predefined_dataset 

232 else: 

233 # Can only look for strings in the known repos list. 

234 if isinstance(config, str) and config in self.get_known_repos(): 

235 config = str(self.get_repo_uri(config)) 

236 try: 

237 self._config = ButlerConfig(config, searchPaths=searchPaths) 

238 except FileNotFoundError as e: 

239 if known := self.get_known_repos(): 

240 aliases = f"(known aliases: {', '.join(known)})" 

241 else: 

242 aliases = "(no known aliases)" 

243 raise FileNotFoundError(f"{e} {aliases}") from e 


245 try: 

246 if "root" in self._config: 

247 butlerRoot = self._config["root"] 

248 else: 

249 butlerRoot = self._config.configDir 

250 if writeable is None: 

251 writeable = run is not None 

252 self.registry = Registry.fromConfig( 

253 self._config, butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

254 ) 

255 self.datastore = Datastore.fromConfig( 

256 self._config, self.registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

257 ) 

258 self.storageClasses = StorageClassFactory() 

259 self.storageClasses.addFromConfig(self._config) 

260 self._allow_put_of_predefined_dataset = self._config.get( 

261 "allow_put_of_predefined_dataset", False 

262 ) 

263 except Exception: 

264 # Failures here usually mean that configuration is incomplete, 

265 # just issue an error message which includes config file URI. 

266 log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

267 raise 

268 

269 # For execution butler the datastore needs a special 

270 # dependency-inversion trick. This is not used by regular butler, 

271 # but we do not have a way to distinguish regular butler from execution 

272 # butler. 

273 self.datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

274 

275 if "run" in self._config or "collection" in self._config: 

276 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

277 

278 GENERATION: ClassVar[int] = 3 

279 """This is a Generation 3 Butler. 

280 

281 This attribute may be removed in the future, once the Generation 2 Butler 

282 interface has been fully retired; it should only be used in transitional 

283 code. 

284 """ 

285 

286 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

287 """Return DatasetType defined in registry given dataset type name.""" 

288 try: 

289 return self.registry.getDatasetType(name) 

290 except MissingDatasetTypeError: 

291 return None 

292 

293 @classmethod 

294 def get_repo_uri(cls, label: str) -> ResourcePath: 

295 """Look up the label in a butler repository index. 

296 

297 Parameters 

298 ---------- 

299 label : `str` 

300 Label of the Butler repository to look up. 

301 

302 Returns 

303 ------- 

304 uri : `lsst.resources.ResourcePath` 

305 URI to the Butler repository associated with the given label. 

306 

307 Raises 

308 ------ 

309 KeyError 

310 Raised if the label is not found in the index, or if an index 

311 can not be found at all. 

312 

313 Notes 

314 ----- 

315 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

316 information is discovered. 

317 """ 

318 return ButlerRepoIndex.get_repo_uri(label) 

319 

320 @classmethod 

321 def get_known_repos(cls) -> Set[str]: 

322 """Retrieve the list of known repository labels. 

323 

324 Returns 

325 ------- 

326 repos : `set` of `str` 

327 All the known labels. Can be empty if no index can be found. 

328 

329 Notes 

330 ----- 

331 See `~lsst.daf.butler.ButlerRepoIndex` for details on how the 

332 information is discovered. 

333 """ 

334 return ButlerRepoIndex.get_known_repos() 
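    # Illustrative sketch (not part of the original file): the two class
    # methods above back the label-based construction path in __init__,
    # where a plain string matching a known repository label is resolved
    # to its URI before the configuration is read. The label "main" and
    # the index contents are assumed for the example.
    #
    #     if "main" in Butler.get_known_repos():
    #         print(Butler.get_repo_uri("main"))  # e.g. a file:// or s3:// URI
    #         butler = Butler("main", collections=["some/collection"])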

335 

336 @staticmethod 

337 def makeRepo( 

338 root: ResourcePathExpression, 

339 config: Union[Config, str, None] = None, 

340 dimensionConfig: Union[Config, str, None] = None, 

341 standalone: bool = False, 

342 searchPaths: Optional[List[str]] = None, 

343 forceConfigRoot: bool = True, 

344 outfile: Optional[ResourcePathExpression] = None, 

345 overwrite: bool = False, 

346 ) -> Config: 

347 """Create an empty data repository by adding a butler.yaml config 

348 to a repository root directory. 

349 

350 Parameters 

351 ---------- 

352 root : `lsst.resources.ResourcePathExpression` 

353 Path or URI to the root location of the new repository. Will be 

354 created if it does not exist. 

355 config : `Config` or `str`, optional 

356 Configuration to write to the repository, after setting any 

357 root-dependent Registry or Datastore config options. Can not 

358 be a `ButlerConfig` or a `ConfigSubset`. If `None`, default 

359 configuration will be used. Root-dependent config options 

360 specified in this config are overwritten if ``forceConfigRoot`` 

361 is `True`. 

362 dimensionConfig : `Config` or `str`, optional 

363 Configuration for dimensions, will be used to initialize registry 

364 database. 

365 standalone : `bool` 

366 If True, write all expanded defaults, not just customized or 

367 repository-specific settings. 

368 This (mostly) decouples the repository from the default 

369 configuration, insulating it from changes to the defaults (which 

370 may be good or bad, depending on the nature of the changes). 

371 Future *additions* to the defaults will still be picked up when 

372 initializing `Butlers` to repos created with ``standalone=True``. 

373 searchPaths : `list` of `str`, optional 

374 Directory paths to search when calculating the full butler 

375 configuration. 

376 forceConfigRoot : `bool`, optional 

377 If `False`, any values present in the supplied ``config`` that 

378 would normally be reset are not overridden and will appear 

379 directly in the output config. This allows non-standard overrides 

380 of the root directory for a datastore or registry to be given. 

381 If this parameter is `True` the values for ``root`` will be 

382 forced into the resulting config if appropriate. 

383 outfile : `lsst.resources.ResourcePathExpression`, optional

384 If not `None`, the output configuration will be written to this

385 location rather than into the repository itself. Can be a URI 

386 string. Can refer to a directory that will be used to write 

387 ``butler.yaml``. 

388 overwrite : `bool`, optional 

389 Create a new configuration file even if one already exists 

390 in the specified output location. Default is to raise 

391 an exception. 

392 

393 Returns 

394 ------- 

395 config : `Config` 

396 The updated `Config` instance written to the repo. 

397 

398 Raises 

399 ------ 

400 ValueError 

401 Raised if a ButlerConfig or ConfigSubset is passed instead of a 

402 regular Config (as these subclasses would make it impossible to 

403 support ``standalone=False``). 

404 FileExistsError 

405 Raised if the output config file already exists. 

406 os.error 

407 Raised if the directory does not exist, exists but is not a 

408 directory, or cannot be created. 

409 

410 Notes 

411 ----- 

412 Note that when ``standalone=False`` (the default), the configuration 

413 search path (see `ConfigSubset.defaultSearchPaths`) that was used to 

414 construct the repository should also be used to construct any Butlers 

415 to avoid configuration inconsistencies. 

416 """ 

417 if isinstance(config, (ButlerConfig, ConfigSubset)): 

418 raise ValueError("makeRepo must be passed a regular Config without defaults applied.") 

419 

420 # Ensure that the root of the repository exists or can be made 

421 root_uri = ResourcePath(root, forceDirectory=True) 

422 root_uri.mkdir() 

423 

424 config = Config(config) 

425 

426 # If we are creating a new repo from scratch with relative roots, 

427 # do not propagate an explicit root from the config file 

428 if "root" in config: 

429 del config["root"] 

430 

431 full = ButlerConfig(config, searchPaths=searchPaths) # this applies defaults 

432 imported_class = doImportType(full["datastore", "cls"]) 

433 if not issubclass(imported_class, Datastore): 

434 raise TypeError(f"Imported datastore class {full['datastore', 'cls']} is not a Datastore") 

435 datastoreClass: Type[Datastore] = imported_class 

436 datastoreClass.setConfigRoot(BUTLER_ROOT_TAG, config, full, overwrite=forceConfigRoot) 

437 

438 # if key exists in given config, parse it, otherwise parse the defaults 

439 # in the expanded config 

440 if config.get(("registry", "db")): 

441 registryConfig = RegistryConfig(config) 

442 else: 

443 registryConfig = RegistryConfig(full) 

444 defaultDatabaseUri = registryConfig.makeDefaultDatabaseUri(BUTLER_ROOT_TAG) 

445 if defaultDatabaseUri is not None: 

446 Config.updateParameters( 

447 RegistryConfig, config, full, toUpdate={"db": defaultDatabaseUri}, overwrite=forceConfigRoot 

448 ) 

449 else: 

450 Config.updateParameters(RegistryConfig, config, full, toCopy=("db",), overwrite=forceConfigRoot) 

451 

452 if standalone: 

453 config.merge(full) 

454 else: 

455 # Always expand the registry.managers section into the per-repo 

456 # config, because after the database schema is created, it's not 

457 # allowed to change anymore. Note that in the standalone=True 

458 # branch, _everything_ in the config is expanded, so there's no 

459 # need to special case this. 

460 Config.updateParameters(RegistryConfig, config, full, toMerge=("managers",), overwrite=False) 

461 configURI: ResourcePathExpression 

462 if outfile is not None: 

463 # When writing to a separate location we must include 

464 # the root of the butler repo in the config else it won't know 

465 # where to look. 

466 config["root"] = root_uri.geturl() 

467 configURI = outfile 

468 else: 

469 configURI = root_uri 

470 # Strip obscore configuration, if it is present, before writing config

471 # to a file; the obscore config will be stored in registry.

472 if (obscore_config_key := ("registry", "managers", "obscore", "config")) in config: 

473 config_to_write = config.copy() 

474 del config_to_write[obscore_config_key] 

475 config_to_write.dumpToUri(configURI, overwrite=overwrite) 

476 # configFile attribute is updated, need to copy it to original. 

477 config.configFile = config_to_write.configFile 

478 else: 

479 config.dumpToUri(configURI, overwrite=overwrite) 

480 

481 # Create Registry and populate tables 

482 registryConfig = RegistryConfig(config.get("registry")) 

483 dimensionConfig = DimensionConfig(dimensionConfig) 

484 Registry.createFromConfig(registryConfig, dimensionConfig=dimensionConfig, butlerRoot=root_uri) 

485 

486 log.verbose("Wrote new Butler configuration file to %s", configURI) 

487 

488 return config 
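    # Illustrative sketch (not part of the original file): typical use of
    # makeRepo followed by construction of a writeable Butler on the new
    # repository. The path is a placeholder.
    #
    #     Butler.makeRepo("/path/to/new/repo")
    #     butler = Butler("/path/to/new/repo", writeable=True)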

489 

490 @classmethod 

491 def _unpickle( 

492 cls, 

493 config: ButlerConfig, 

494 collections: Optional[tuple[str, ...]], 

495 run: Optional[str], 

496 defaultDataId: Dict[str, str], 

497 writeable: bool, 

498 ) -> Butler: 

499 """Callable used to unpickle a Butler. 

500 

501 We prefer not to use ``Butler.__init__`` directly so we can force some 

502 of its many arguments to be keyword-only (note that ``__reduce__`` 

503 can only invoke callables with positional arguments). 

504 

505 Parameters 

506 ---------- 

507 config : `ButlerConfig` 

508 Butler configuration, already coerced into a true `ButlerConfig` 

509 instance (and hence after any search paths for overrides have been 

510 utilized). 

511 collections : `tuple` [ `str` ] 

512 Names of the default collections to read from. 

513 run : `str`, optional 

514 Name of the default `~CollectionType.RUN` collection to write to. 

515 defaultDataId : `dict` [ `str`, `str` ] 

516 Default data ID values. 

517 writeable : `bool` 

518 Whether the Butler should support write operations. 

519 

520 Returns 

521 ------- 

522 butler : `Butler` 

523 A new `Butler` instance. 

524 """ 

525 # MyPy doesn't recognize that the kwargs below are totally valid; it

526 # seems to think ``**defaultDataId`` is a _positional_ argument!

527 return cls( 

528 config=config, 

529 collections=collections, 

530 run=run, 

531 writeable=writeable, 

532 **defaultDataId, # type: ignore 

533 ) 

534 

535 def __reduce__(self) -> tuple: 

536 """Support pickling.""" 

537 return ( 

538 Butler._unpickle, 

539 ( 

540 self._config, 

541 self.collections, 

542 self.run, 

543 self.registry.defaults.dataId.byName(), 

544 self.registry.isWriteable(), 

545 ), 

546 ) 

547 

548 def __str__(self) -> str: 

549 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

550 self.collections, self.run, self.datastore, self.registry 

551 ) 

552 

553 def isWriteable(self) -> bool: 

554 """Return `True` if this `Butler` supports write operations.""" 

555 return self.registry.isWriteable() 

556 

557 @contextlib.contextmanager 

558 def transaction(self) -> Iterator[None]: 

559 """Context manager supporting `Butler` transactions. 

560 

561 Transactions can be nested. 

562 """ 

563 with self.registry.transaction(): 

564 with self.datastore.transaction(): 

565 yield 
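        # Illustrative sketch (not part of the original file): registry and
        # datastore changes made inside the context are rolled back together
        # if the block raises. The dataset type and data ID are placeholders.
        #
        #     with butler.transaction():
        #         butler.put(obj, "my_dataset_type", my_data_id)
        #         # raising here would undo both the registry insert and the
        #         # datastore write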

566 

567 def _standardizeArgs( 

568 self, 

569 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

570 dataId: Optional[DataId] = None, 

571 for_put: bool = True, 

572 **kwargs: Any, 

573 ) -> Tuple[DatasetType, Optional[DataId]]: 

574 """Standardize the arguments passed to several Butler APIs. 

575 

576 Parameters 

577 ---------- 

578 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

579 When `DatasetRef` the `dataId` should be `None`. 

580 Otherwise the `DatasetType` or name thereof. 

581 dataId : `dict` or `DataCoordinate` 

582 A `dict` of `Dimension` link name, value pairs that label the 

583 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

584 should be provided as the second argument. 

585 for_put : `bool`, optional 

586 If `True` this call is invoked as part of a `Butler.put()`. 

587 Otherwise it is assumed to be part of a `Butler.get()`. This 

588 parameter is only relevant if there is dataset type 

589 inconsistency. 

590 **kwargs 

591 Additional keyword arguments used to augment or construct a 

592 `DataCoordinate`. See `DataCoordinate.standardize` 

593 parameters. 

594 

595 Returns 

596 ------- 

597 datasetType : `DatasetType` 

598 A `DatasetType` instance extracted from ``datasetRefOrType``. 

599 dataId : `dict` or `DataId`, optional 

600 Argument that can be used (along with ``kwargs``) to construct a 

601 `DataId`. 

602 

603 Notes 

604 ----- 

605 Butler APIs that conceptually need a DatasetRef also allow passing a 

606 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

607 keyword arguments that can be used to construct one) separately. This 

608 method accepts those arguments and always returns a true `DatasetType` 

609 and a `DataId` or `dict`. 

610 

611 Standardization of `dict` vs `DataId` is best handled by passing the 

612 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

613 generally similarly flexible. 

614 """ 

615 externalDatasetType: Optional[DatasetType] = None 

616 internalDatasetType: Optional[DatasetType] = None 

617 if isinstance(datasetRefOrType, DatasetRef): 

618 if dataId is not None or kwargs: 

619 raise ValueError("DatasetRef given, cannot use dataId as well") 

620 externalDatasetType = datasetRefOrType.datasetType 

621 dataId = datasetRefOrType.dataId 

622 else: 

623 # Don't check whether DataId is provided, because Registry APIs 

624 # can usually construct a better error message when it wasn't. 

625 if isinstance(datasetRefOrType, DatasetType): 

626 externalDatasetType = datasetRefOrType 

627 else: 

628 internalDatasetType = self.registry.getDatasetType(datasetRefOrType) 

629 

630 # Check that they are self-consistent 

631 if externalDatasetType is not None: 

632 internalDatasetType = self.registry.getDatasetType(externalDatasetType.name) 

633 if externalDatasetType != internalDatasetType: 

634 # We can allow differences if they are compatible, depending 

635 # on whether this is a get or a put. A get requires that 

636 # the python type associated with the datastore can be 

637 # converted to the user type. A put requires that the user 

638 # supplied python type can be converted to the internal 

639 # type expected by registry. 

640 relevantDatasetType = internalDatasetType 

641 if for_put: 

642 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

643 else: 

644 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

645 relevantDatasetType = externalDatasetType 

646 if not is_compatible: 

647 raise ValueError( 

648 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

649 f"registry definition ({internalDatasetType})" 

650 ) 

651 # Override the internal definition. 

652 internalDatasetType = relevantDatasetType 

653 

654 assert internalDatasetType is not None 

655 return internalDatasetType, dataId 

656 

657 def _rewrite_data_id( 

658 self, dataId: Optional[DataId], datasetType: DatasetType, **kwargs: Any 

659 ) -> Tuple[Optional[DataId], Dict[str, Any]]: 

660 """Rewrite a data ID taking into account dimension records. 

661 

662 Take a Data ID and keyword args and rewrite it if necessary to 

663 allow the user to specify dimension records rather than dimension 

664 primary values. 

665 

666 This allows a user to include a dataId dict with keys of 

667 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

668 the integer exposure ID. It also allows a string to be given 

669 for a dimension value rather than the integer ID if that is more 

670 convenient. For example, rather than having to specify the

671 detector with ``detector.full_name``, a string given for ``detector`` 

672 will be interpreted as the full name and converted to the integer 

673 value. 

674 

675 Keyword arguments can also use strings for dimensions like detector 

676 and exposure but Python does not allow them to include ``.``, and

677 so the ``exposure.day_obs`` syntax cannot be used in a keyword

678 argument. 

679 

680 Parameters 

681 ---------- 

682 dataId : `dict` or `DataCoordinate` 

683 A `dict` of `Dimension` link name, value pairs that will label the 

684 `DatasetRef` within a Collection. 

685 datasetType : `DatasetType` 

686 The dataset type associated with this dataId. Required to 

687 determine the relevant dimensions. 

688 **kwargs 

689 Additional keyword arguments used to augment or construct a 

690 `DataId`. See `DataId` parameters. 

691 

692 Returns 

693 ------- 

694 dataId : `dict` or `DataCoordinate` 

695 The dataId, possibly rewritten. If given a `DataCoordinate` and

696 no keyword arguments, the original dataId will be returned 

697 unchanged. 

698 **kwargs : `dict` 

699 Any unused keyword arguments (would normally be empty dict). 

700 """ 

701 # Do nothing if we have a standalone DataCoordinate. 

702 if isinstance(dataId, DataCoordinate) and not kwargs: 

703 return dataId, kwargs 

704 

705 # Process dimension records that are using record information 

706 # rather than ids 

707 newDataId: Dict[str, DataIdValue] = {} 

708 byRecord: Dict[str, Dict[str, Any]] = defaultdict(dict) 

709 

710 # If all of the dataId comes from keyword parameters we do not need

711 # to do anything here because they cannot be of the form

712 # exposure.obs_id, since a "." is not allowed in a keyword parameter.

713 if dataId: 

714 for k, v in dataId.items(): 

715 # If we have a Dimension we do not need to do anything 

716 # because it cannot be a compound key. 

717 if isinstance(k, str) and "." in k: 

718 # Someone is using a more human-readable dataId 

719 dimensionName, record = k.split(".", 1) 

720 byRecord[dimensionName][record] = v 

721 elif isinstance(k, Dimension): 

722 newDataId[k.name] = v 

723 else: 

724 newDataId[k] = v 

725 

726 # Go through the updated dataId and check the type in case someone is 

727 # using an alternate key. We have already filtered out the compound 

728 # keys in ``dimension.record`` format.

729 not_dimensions = {} 

730 

731 # Will need to look in the dataId and the keyword arguments 

732 # and will remove them if they need to be fixed or are unrecognized. 

733 for dataIdDict in (newDataId, kwargs): 

734 # Use a list so we can adjust the dict safely in the loop 

735 for dimensionName in list(dataIdDict): 

736 value = dataIdDict[dimensionName] 

737 try: 

738 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

739 except KeyError: 

740 # This is not a real dimension 

741 not_dimensions[dimensionName] = value 

742 del dataIdDict[dimensionName] 

743 continue 

744 

745 # Convert an integral type to an explicit int to simplify 

746 # comparisons here 

747 if isinstance(value, numbers.Integral): 

748 value = int(value) 

749 

750 if not isinstance(value, dimension.primaryKey.getPythonType()): 

751 for alternate in dimension.alternateKeys: 

752 if isinstance(value, alternate.getPythonType()): 

753 byRecord[dimensionName][alternate.name] = value 

754 del dataIdDict[dimensionName] 

755 log.debug( 

756 "Converting dimension %s to %s.%s=%s", 

757 dimensionName, 

758 dimensionName, 

759 alternate.name, 

760 value, 

761 ) 

762 break 

763 else: 

764 log.warning( 

765 "Type mismatch found for value '%r' provided for dimension %s. " 

766 "Could not find matching alternative (primary key has type %s) " 

767 "so attempting to use as-is.", 

768 value, 

769 dimensionName, 

770 dimension.primaryKey.getPythonType(), 

771 ) 

772 

773 # By this point kwargs and newDataId should only include valid 

774 # dimensions. Merge kwargs in to the new dataId and log if there 

775 # are dimensions in both (rather than calling update). 

776 for k, v in kwargs.items(): 

777 if k in newDataId and newDataId[k] != v: 

778 log.debug( 

779 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

780 ) 

781 newDataId[k] = v 

782 # No need to retain any values in kwargs now. 

783 kwargs = {} 

784 

785 # If we have some unrecognized dimensions we have to try to connect 

786 # them to records in other dimensions. This is made more complicated 

787 # by some dimensions having records with clashing names. A mitigation 

788 # is that we can tell by this point which dimensions are missing 

789 # for the DatasetType but this does not work for calibrations 

790 # where additional dimensions can be used to constrain the temporal 

791 # axis. 

792 if not_dimensions: 

793 # Search for all dimensions even if we have been given a value 

794 # explicitly. In some cases records are given as well as the 

795 # actual dimension and this should not be an error if they

796 # match. 

797 mandatoryDimensions = datasetType.dimensions.names # - provided 

798 

799 candidateDimensions: Set[str] = set() 

800 candidateDimensions.update(mandatoryDimensions) 

801 

802 # For calibrations we may well be needing temporal dimensions 

803 # so rather than always including all dimensions in the scan 

804 # restrict things a little. It is still possible for there 

805 # to be confusion over day_obs in visit vs exposure for example. 

806 # If we are not searching calibration collections things may 

807 # fail but they are going to fail anyway because of the 

808 # ambiguity of the dataId...

809 if datasetType.isCalibration(): 

810 for dim in self.registry.dimensions.getStaticDimensions(): 

811 if dim.temporal: 

812 candidateDimensions.add(str(dim)) 

813 

814 # Look up table for the first association with a dimension 

815 guessedAssociation: Dict[str, Dict[str, Any]] = defaultdict(dict) 

816 

817 # Keep track of whether an item is associated with multiple 

818 # dimensions. 

819 counter: Counter[str] = Counter() 

820 assigned: Dict[str, Set[str]] = defaultdict(set) 

821 

822 # Go through the missing dimensions and associate the 

823 # given names with records within those dimensions 

824 matched_dims = set() 

825 for dimensionName in candidateDimensions: 

826 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

827 fields = dimension.metadata.names | dimension.uniqueKeys.names 

828 for field in not_dimensions: 

829 if field in fields: 

830 guessedAssociation[dimensionName][field] = not_dimensions[field] 

831 counter[dimensionName] += 1 

832 assigned[field].add(dimensionName) 

833 matched_dims.add(field) 

834 

835 # Calculate the fields that matched nothing. 

836 never_found = set(not_dimensions) - matched_dims 

837 

838 if never_found: 

839 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

840 

841 # There is a chance we have allocated a single dataId item 

842 # to multiple dimensions. Need to decide which should be retained. 

843 # For now assume that the most popular alternative wins. 

844 # This means that day_obs with seq_num will result in 

845 # exposure.day_obs and not visit.day_obs 

846 # Also prefer an explicitly missing dimension over an inferred 

847 # temporal dimension. 

848 for fieldName, assignedDimensions in assigned.items(): 

849 if len(assignedDimensions) > 1: 

850 # Pick the most popular (preferring mandatory dimensions) 

851 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

852 if requiredButMissing: 

853 candidateDimensions = requiredButMissing 

854 else: 

855 candidateDimensions = assignedDimensions 

856 

857 # If this is a choice between visit and exposure and 

858 # neither was a required part of the dataset type, 

859 # (hence in this branch) always prefer exposure over 

860 # visit since exposures are always defined and visits 

861 # are defined from exposures. 

862 if candidateDimensions == {"exposure", "visit"}: 

863 candidateDimensions = {"exposure"} 

864 

865 # Select the relevant items and get a new restricted 

866 # counter. 

867 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

868 duplicatesCounter: Counter[str] = Counter() 

869 duplicatesCounter.update(theseCounts) 

870 

871 # Choose the most common. If they are equally common 

872 # we will pick the one that was found first. 

873 # Returns a list of tuples 

874 selected = duplicatesCounter.most_common(1)[0][0] 

875 

876 log.debug( 

877 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

878 " Removed ambiguity by choosing dimension %s.", 

879 fieldName, 

880 ", ".join(assignedDimensions), 

881 selected, 

882 ) 

883 

884 for candidateDimension in assignedDimensions: 

885 if candidateDimension != selected: 

886 del guessedAssociation[candidateDimension][fieldName] 

887 

888 # Update the record look up dict with the new associations 

889 for dimensionName, values in guessedAssociation.items(): 

890 if values: # A dict might now be empty 

891 log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) 

892 byRecord[dimensionName].update(values) 

893 

894 if byRecord: 

895 # Some record specifiers were found so we need to convert 

896 # them to the Id form 

897 for dimensionName, values in byRecord.items(): 

898 if dimensionName in newDataId: 

899 log.debug( 

900 "DataId specified explicit %s dimension value of %s in addition to" 

901 " general record specifiers for it of %s. Ignoring record information.", 

902 dimensionName, 

903 newDataId[dimensionName], 

904 str(values), 

905 ) 

906 # Get the actual record and compare with these values. 

907 try: 

908 recs = list(self.registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

909 except DataIdError: 

910 raise ValueError( 

911 f"Could not find dimension '{dimensionName}'" 

912 f" with dataId {newDataId} as part of comparing with" 

913 f" record values {byRecord[dimensionName]}" 

914 ) from None 

915 if len(recs) == 1: 

916 errmsg: List[str] = [] 

917 for k, v in values.items(): 

918 if (recval := getattr(recs[0], k)) != v: 

919 errmsg.append(f"{k}({recval} != {v})") 

920 if errmsg: 

921 raise ValueError( 

922 f"Dimension {dimensionName} in dataId has explicit value" 

923 " inconsistent with records: " + ", ".join(errmsg) 

924 ) 

925 else: 

926 # Multiple matches for an explicit dimension 

927 # should never happen but let downstream complain. 

928 pass 

929 continue 

930 

931 # Build up a WHERE expression 

932 bind = {k: v for k, v in values.items()} 

933 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

934 

935 # Hopefully we get a single record that matches 

936 records = set( 

937 self.registry.queryDimensionRecords( 

938 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

939 ) 

940 ) 

941 

942 if len(records) != 1: 

943 if len(records) > 1: 

944 # visit can have an ambiguous answer without involving 

945 # visit_system. The default visit_system is defined 

946 # by the instrument. 

947 if ( 

948 dimensionName == "visit" 

949 and "visit_system_membership" in self.registry.dimensions 

950 and "visit_system" in self.registry.dimensions["instrument"].metadata 

951 ): 

952 instrument_records = list( 

953 self.registry.queryDimensionRecords( 

954 "instrument", 

955 dataId=newDataId, 

956 **kwargs, 

957 ) 

958 ) 

959 if len(instrument_records) == 1: 

960 visit_system = instrument_records[0].visit_system 

961 if visit_system is None: 

962 # Set to a value that will never match. 

963 visit_system = -1 

964 

965 # Look up each visit in the 

966 # visit_system_membership records. 

967 for rec in records: 

968 membership = list( 

969 self.registry.queryDimensionRecords( 

970 # Use bind to allow zero results. 

971 # This is a fully-specified query. 

972 "visit_system_membership", 

973 where="instrument = inst AND visit_system = system AND visit = v", 

974 bind=dict( 

975 inst=instrument_records[0].name, system=visit_system, v=rec.id 

976 ), 

977 ) 

978 ) 

979 if membership: 

980 # This record is the right answer. 

981 records = set([rec]) 

982 break 

983 

984 # The ambiguity may have been resolved so check again. 

985 if len(records) > 1: 

986 log.debug("Received %d records from constraints of %s", len(records), str(values)) 

987 for r in records: 

988 log.debug("- %s", str(r)) 

989 raise ValueError( 

990 f"DataId specification for dimension {dimensionName} is not" 

991 f" uniquely constrained to a single dataset by {values}." 

992 f" Got {len(records)} results." 

993 ) 

994 else: 

995 raise ValueError( 

996 f"DataId specification for dimension {dimensionName} matched no" 

997 f" records when constrained by {values}" 

998 ) 

999 

1000 # Get the primary key from the real dimension object 

1001 dimension = self.registry.dimensions.getStaticDimensions()[dimensionName] 

1002 if not isinstance(dimension, Dimension): 

1003 raise RuntimeError( 

1004 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

1005 ) 

1006 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

1007 

1008 return newDataId, kwargs 

1009 

1010 def _findDatasetRef( 

1011 self, 

1012 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1013 dataId: Optional[DataId] = None, 

1014 *, 

1015 collections: Any = None, 

1016 allowUnresolved: bool = False, 

1017 **kwargs: Any, 

1018 ) -> DatasetRef: 

1019 """Shared logic for methods that start with a search for a dataset in 

1020 the registry. 

1021 

1022 Parameters 

1023 ---------- 

1024 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1025 When `DatasetRef` the `dataId` should be `None`. 

1026 Otherwise the `DatasetType` or name thereof. 

1027 dataId : `dict` or `DataCoordinate`, optional 

1028 A `dict` of `Dimension` link name, value pairs that label the 

1029 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1030 should be provided as the first argument. 

1031 collections : Any, optional 

1032 Collections to be searched, overriding ``self.collections``. 

1033 Can be any of the types supported by the ``collections`` argument 

1034 to butler construction. 

1035 allowUnresolved : `bool`, optional 

1036 If `True`, return an unresolved `DatasetRef` if finding a resolved 

1037 one in the `Registry` fails. Defaults to `False`. 

1038 **kwargs 

1039 Additional keyword arguments used to augment or construct a 

1040 `DataId`. See `DataId` parameters. 

1041 

1042 Returns 

1043 ------- 

1044 ref : `DatasetRef` 

1045 A reference to the dataset identified by the given arguments. 

1046 This can be the same dataset reference as given if it was 

1047 resolved. 

1048 

1049 Raises 

1050 ------ 

1051 LookupError 

1052 Raised if no matching dataset exists in the `Registry` (and 

1053 ``allowUnresolved is False``). 

1054 ValueError 

1055 Raised if a resolved `DatasetRef` was passed as an input, but it 

1056 differs from the one found in the registry. 

1057 TypeError 

1058 Raised if no collections were provided. 

1059 """ 

1060 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

1061 if isinstance(datasetRefOrType, DatasetRef): 

1062 idNumber = datasetRefOrType.id 

1063 # This is a resolved ref, return it immediately. 

1064 if idNumber: 

1065 return datasetRefOrType 

1066 else: 

1067 idNumber = None 

1068 timespan: Optional[Timespan] = None 

1069 

1070 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1071 

1072 if datasetType.isCalibration(): 

1073 # Because this is a calibration dataset, first try to

1074 # standardize the data ID without restricting the dimensions to

1075 # those of the dataset type requested, because there may be extra 

1076 # dimensions that provide temporal information for a validity-range 

1077 # lookup. 

1078 dataId = DataCoordinate.standardize( 

1079 dataId, universe=self.registry.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1080 ) 

1081 if dataId.graph.temporal: 

1082 dataId = self.registry.expandDataId(dataId) 

1083 timespan = dataId.timespan 

1084 else: 

1085 # Standardize the data ID to just the dimensions of the dataset 

1086 # type instead of letting registry.findDataset do it, so we get the 

1087 # result even if no dataset is found. 

1088 dataId = DataCoordinate.standardize( 

1089 dataId, graph=datasetType.dimensions, defaults=self.registry.defaults.dataId, **kwargs 

1090 ) 

1091 # Always lookup the DatasetRef, even if one is given, to ensure it is 

1092 # present in the current collection. 

1093 ref = self.registry.findDataset(datasetType, dataId, collections=collections, timespan=timespan) 

1094 if ref is None: 

1095 if allowUnresolved: 

1096 with warnings.catch_warnings(): 

1097 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

1098 return DatasetRef(datasetType, dataId) 

1099 else: 

1100 if collections is None: 

1101 collections = self.registry.defaults.collections 

1102 raise LookupError( 

1103 f"Dataset {datasetType.name} with data ID {dataId} " 

1104 f"could not be found in collections {collections}." 

1105 ) 

1106 if idNumber is not None and idNumber != ref.id: 

1107 if collections is None: 

1108 collections = self.registry.defaults.collections 

1109 raise ValueError( 

1110 f"DatasetRef.id provided ({idNumber}) does not match " 

1111 f"id ({ref.id}) in registry in collections {collections}." 

1112 ) 

1113 if datasetType != ref.datasetType: 

1114 # If they differ it is because the user explicitly specified 

1115 # a compatible dataset type to this call rather than using the 

1116 # registry definition. The DatasetRef must therefore be recreated 

1117 # using the user definition such that the expected type is 

1118 # returned. 

1119 ref = DatasetRef(datasetType, ref.dataId, run=ref.run, id=ref.id) 

1120 

1121 return ref 

1122 

1123 @transactional 

1124 @deprecated( 

1125 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

1126 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

1127 " were relying on the run parameter to determine the run." 

1128 " Will be removed after v27.0.", 

1129 version="v26.0", 

1130 category=FutureWarning, 

1131 ) 

1132 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

1133 # Docstring inherited. 

1134 return self.put(obj, ref) 

1135 

1136 @transactional 

1137 def put( 

1138 self, 

1139 obj: Any, 

1140 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1141 /, 

1142 dataId: Optional[DataId] = None, 

1143 *, 

1144 run: Optional[str] = None, 

1145 **kwargs: Any, 

1146 ) -> DatasetRef: 

1147 """Store and register a dataset. 

1148 

1149 Parameters 

1150 ---------- 

1151 obj : `object` 

1152 The dataset. 

1153 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1154 When `DatasetRef` is provided, ``dataId`` should be `None`. 

1155 Otherwise the `DatasetType` or name thereof. If a fully resolved 

1156 `DatasetRef` is given the run and ID are used directly. 

1157 dataId : `dict` or `DataCoordinate` 

1158 A `dict` of `Dimension` link name, value pairs that label the 

1159 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1160 should be provided as the second argument. 

1161 run : `str`, optional 

1162 The name of the run the dataset should be added to, overriding 

1163 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

1164 **kwargs 

1165 Additional keyword arguments used to augment or construct a 

1166 `DataCoordinate`. See `DataCoordinate.standardize` 

1167 parameters. Not used if a resolved `DatasetRef` is provided.

1168 

1169 Returns 

1170 ------- 

1171 ref : `DatasetRef` 

1172 A reference to the stored dataset, updated with the correct id if 

1173 given. 

1174 

1175 Raises 

1176 ------ 

1177 TypeError 

1178 Raised if the butler is read-only or if no run has been provided. 

1179 """ 

1180 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1181 # This is a direct put of predefined DatasetRef. 

1182 log.debug("Butler put direct: %s", datasetRefOrType) 

1183 (imported_ref,) = self.registry._importDatasets( 

1184 [datasetRefOrType], 

1185 expand=True, 

1186 ) 

1187 if imported_ref.id != datasetRefOrType.getCheckedId(): 

1188 raise RuntimeError("This registry configuration does not support direct put of ref.") 

1189 self.datastore.put(obj, datasetRefOrType) 

1190 return datasetRefOrType 

1191 

1192 log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

1193 if not self.isWriteable(): 

1194 raise TypeError("Butler is read-only.") 

1195 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

1196 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1197 raise ValueError("DatasetRef must not be in registry, must have None id") 

1198 

1199 # Handle dimension records in dataId 

1200 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

1201 

1202 # Add Registry Dataset entry. 

1203 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

1204 

1205 # For an execution butler the datasets will be pre-defined. 

1206 # If the butler is configured that way datasets should only be inserted 

1207 # if they do not already exist in registry. Trying and catching 

1208 # ConflictingDefinitionError will not work because the transaction 

1209 # will be corrupted. Instead, in this mode always check first. 

1210 ref = None 

1211 ref_is_predefined = False 

1212 if self._allow_put_of_predefined_dataset: 

1213 # Get the matching ref for this run. 

1214 ref = self.registry.findDataset(datasetType, collections=run, dataId=dataId) 

1215 

1216 if ref: 

1217 # Must be expanded form for datastore templating 

1218 dataId = self.registry.expandDataId(dataId, graph=datasetType.dimensions) 

1219 ref = ref.expanded(dataId) 

1220 ref_is_predefined = True 

1221 

1222 if not ref: 

1223 (ref,) = self.registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

1224 

1225 # If the ref is predefined it is possible that the datastore also 

1226 # has the record. Asking datastore to put it again will result in 

1227 # the artifact being recreated, overwriting previous, then will cause 

1228 # a failure in writing the record which will cause the artifact 

1229 # to be removed. Much safer to ask first before attempting to 

1230 # overwrite. Race conditions should not be an issue for the 

1231 # execution butler environment. 

1232 if ref_is_predefined: 

1233 if self.datastore.knows(ref): 

1234 raise ConflictingDefinitionError(f"Dataset associated with {ref} already exists.")

1235 

1236 self.datastore.put(obj, ref) 

1237 

1238 return ref 

1239 

1240 @deprecated( 

1241 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

1242 " Please use Butler.get(). Will be removed after v27.0.", 

1243 version="v26.0", 

1244 category=FutureWarning, 

1245 ) 

1246 def getDirect( 

1247 self, 

1248 ref: DatasetRef, 

1249 *, 

1250 parameters: Optional[Dict[str, Any]] = None, 

1251 storageClass: Optional[Union[StorageClass, str]] = None, 

1252 ) -> Any: 

1253 """Retrieve a stored dataset. 

1254 

1255 Parameters 

1256 ---------- 

1257 ref : `DatasetRef` 

1258 Resolved reference to an already stored dataset. 

1259 parameters : `dict` 

1260 Additional StorageClass-defined options to control reading, 

1261 typically used to efficiently read only a subset of the dataset. 

1262 storageClass : `StorageClass` or `str`, optional 

1263 The storage class to be used to override the Python type 

1264 returned by this method. By default the returned type matches 

1265 the dataset type definition for this dataset. Specifying a 

1266 read `StorageClass` can force a different type to be returned. 

1267 This type must be compatible with the original type. 

1268 

1269 Returns 

1270 ------- 

1271 obj : `object` 

1272 The dataset. 

1273 """ 

1274 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1275 

1276 @deprecated( 

1277 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1278 "Please use Butler.getDeferred(). Will be removed after v27.0.", 

1279 version="v26.0", 

1280 category=FutureWarning, 

1281 ) 

1282 def getDirectDeferred( 

1283 self, 

1284 ref: DatasetRef, 

1285 *, 

1286 parameters: Union[dict, None] = None, 

1287 storageClass: str | StorageClass | None = None, 

1288 ) -> DeferredDatasetHandle: 

1289 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1290 from a resolved `DatasetRef`. 

1291 

1292 Parameters 

1293 ---------- 

1294 ref : `DatasetRef` 

1295 Resolved reference to an already stored dataset. 

1296 parameters : `dict` 

1297 Additional StorageClass-defined options to control reading, 

1298 typically used to efficiently read only a subset of the dataset. 

1299 storageClass : `StorageClass` or `str`, optional 

1300 The storage class to be used to override the Python type 

1301 returned by this method. By default the returned type matches 

1302 the dataset type definition for this dataset. Specifying a 

1303 read `StorageClass` can force a different type to be returned. 

1304 This type must be compatible with the original type. 

1305 

1306 Returns 

1307 ------- 

1308 obj : `DeferredDatasetHandle` 

1309 A handle which can be used to retrieve a dataset at a later time. 

1310 

1311 Raises 

1312 ------ 

1313 AmbiguousDatasetError 

1314 Raised if ``ref.id is None``, i.e. the reference is unresolved. 

1315 """ 

1316 if ref.id is None: 

1317 raise AmbiguousDatasetError( 

1318 f"Dataset of type {ref.datasetType.name} with data ID {ref.dataId} is not resolved." 

1319 ) 

1320 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1321 

1322 def getDeferred( 

1323 self, 

1324 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1325 /, 

1326 dataId: Optional[DataId] = None, 

1327 *, 

1328 parameters: Union[dict, None] = None, 

1329 collections: Any = None, 

1330 storageClass: str | StorageClass | None = None, 

1331 **kwargs: Any, 

1332 ) -> DeferredDatasetHandle: 

1333 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1334 after an immediate registry lookup. 

1335 

1336 Parameters 

1337 ---------- 

1338 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1339 When `DatasetRef` the `dataId` should be `None`. 

1340 Otherwise the `DatasetType` or name thereof. 

1341 dataId : `dict` or `DataCoordinate`, optional 

1342 A `dict` of `Dimension` link name, value pairs that label the 

1343 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1344 should be provided as the first argument. 

1345 parameters : `dict` 

1346 Additional StorageClass-defined options to control reading, 

1347 typically used to efficiently read only a subset of the dataset. 

1348 collections : Any, optional 

1349 Collections to be searched, overriding ``self.collections``. 

1350 Can be any of the types supported by the ``collections`` argument 

1351 to butler construction. 

1352 storageClass : `StorageClass` or `str`, optional 

1353 The storage class to be used to override the Python type 

1354 returned by this method. By default the returned type matches 

1355 the dataset type definition for this dataset. Specifying a 

1356 read `StorageClass` can force a different type to be returned. 

1357 This type must be compatible with the original type. 

1358 **kwargs 

1359 Additional keyword arguments used to augment or construct a 

1360 `DataId`. See `DataId` parameters. 

1361 

1362 Returns 

1363 ------- 

1364 obj : `DeferredDatasetHandle` 

1365 A handle which can be used to retrieve a dataset at a later time. 

1366 

1367 Raises 

1368 ------ 

1369 LookupError 

1370 Raised if no matching dataset exists in the `Registry` for the

1371 given data ID and collections.

1372 ValueError 

1373 Raised if a resolved `DatasetRef` was passed as an input, but it 

1374 differs from the one found in the registry. 

1375 TypeError 

1376 Raised if no collections were provided. 

1377 """ 

1378 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1379 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 
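    # Illustrative sketch (not part of the original file): the registry lookup
    # happens immediately, while the datastore I/O is deferred until the handle
    # is read. Dataset type name and data ID are placeholders.
    #
    #     handle = butler.getDeferred("my_dataset_type", my_data_id)
    #     ...
    #     obj = handle.get()   # datastore read happens here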

1380 

1381 def get( 

1382 self, 

1383 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1384 /, 

1385 dataId: Optional[DataId] = None, 

1386 *, 

1387 parameters: Optional[Dict[str, Any]] = None, 

1388 collections: Any = None, 

1389 storageClass: Optional[Union[StorageClass, str]] = None, 

1390 **kwargs: Any, 

1391 ) -> Any: 

1392 """Retrieve a stored dataset. 

1393 

1394 Parameters 

1395 ---------- 

1396 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1397 When `DatasetRef` the `dataId` should be `None`. 

1398 Otherwise the `DatasetType` or name thereof. 

1399 If a resolved `DatasetRef`, the associated dataset 

1400 is returned directly without additional querying. 

1401 dataId : `dict` or `DataCoordinate` 

1402 A `dict` of `Dimension` link name, value pairs that label the 

1403 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1404 should be provided as the first argument. 

1405 parameters : `dict` 

1406 Additional StorageClass-defined options to control reading, 

1407 typically used to efficiently read only a subset of the dataset. 

1408 collections : Any, optional 

1409 Collections to be searched, overriding ``self.collections``. 

1410 Can be any of the types supported by the ``collections`` argument 

1411 to butler construction. 

1412 storageClass : `StorageClass` or `str`, optional 

1413 The storage class to be used to override the Python type 

1414 returned by this method. By default the returned type matches 

1415 the dataset type definition for this dataset. Specifying a 

1416 read `StorageClass` can force a different type to be returned. 

1417 This type must be compatible with the original type. 

1418 **kwargs 

1419 Additional keyword arguments used to augment or construct a 

1420 `DataCoordinate`. See `DataCoordinate.standardize` 

1421 parameters. 

1422 

1423 Returns 

1424 ------- 

1425 obj : `object` 

1426 The dataset. 

1427 

1428 Raises 

1429 ------ 

1430 LookupError 

1431 Raised if no matching dataset exists in the `Registry`. 

1432 TypeError 

1433 Raised if no collections were provided. 

1434 

1435 Notes 

1436 ----- 

1437 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1438 this method requires that the given data ID include temporal dimensions 

1439 beyond the dimensions of the dataset type itself, in order to find the 

1440 dataset with the appropriate validity range. For example, a "bias" 

1441 dataset with native dimensions ``{instrument, detector}`` could be 

1442 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1443 ``exposure`` is a temporal dimension. 

1444 """ 

1445 log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1446 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1447 return self.datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1448 

1449 def getURIs( 

1450 self, 

1451 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1452 /, 

1453 dataId: Optional[DataId] = None, 

1454 *, 

1455 predict: bool = False, 

1456 collections: Any = None, 

1457 run: Optional[str] = None, 

1458 **kwargs: Any, 

1459 ) -> DatasetRefURIs: 

1460 """Returns the URIs associated with the dataset. 

1461 

1462 Parameters 

1463 ---------- 

1464 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1465 When `DatasetRef` the `dataId` should be `None`. 

1466 Otherwise the `DatasetType` or name thereof. 

1467 dataId : `dict` or `DataCoordinate` 

1468 A `dict` of `Dimension` link name, value pairs that label the 

1469 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1470 should be provided as the first argument. 

1471 predict : `bool` 

1472 If `True`, allow URIs to be returned of datasets that have not 

1473 been written. 

1474 collections : Any, optional 

1475 Collections to be searched, overriding ``self.collections``. 

1476 Can be any of the types supported by the ``collections`` argument 

1477 to butler construction. 

1478 run : `str`, optional 

1479 Run to use for predictions, overriding ``self.run``. 

1480 **kwargs 

1481 Additional keyword arguments used to augment or construct a 

1482 `DataCoordinate`. See `DataCoordinate.standardize` 

1483 parameters. 

1484 

1485 Returns 

1486 ------- 

1487 uris : `DatasetRefURIs` 

1488 The URI to the primary artifact associated with this dataset (if 

1489 the dataset was disassembled within the datastore this may be 

1490 `None`), and the URIs to any components associated with the dataset 

1491 artifact. (can be empty if there are no components). 

1492 """ 

1493 ref = self._findDatasetRef( 

1494 datasetRefOrType, dataId, allowUnresolved=predict, collections=collections, **kwargs 

1495 ) 

1496 if ref.id is None: # only possible if predict is True 

1497 if run is None: 

1498 run = self.run 

1499 if run is None: 

1500 raise TypeError("Cannot predict location with run=None.") 

1501 # Lie about ID, because we can't guess it, and only 

1502 # Datastore.getURIs() will ever see it (and it doesn't use it). 

1503 with warnings.catch_warnings(): 

1504 warnings.simplefilter("ignore", category=UnresolvedRefWarning) 

1505 ref = ref.resolved(id=uuid.UUID("{00000000-0000-0000-0000-000000000000}"), run=run) 

1506 return self.datastore.getURIs(ref, predict) 

1507 

1508 def getURI( 

1509 self, 

1510 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1511 /, 

1512 dataId: Optional[DataId] = None, 

1513 *, 

1514 predict: bool = False, 

1515 collections: Any = None, 

1516 run: Optional[str] = None, 

1517 **kwargs: Any, 

1518 ) -> ResourcePath: 

1519 """Return the URI to the Dataset. 

1520 

1521 Parameters 

1522 ---------- 

1523 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1524 When `DatasetRef` the `dataId` should be `None`. 

1525 Otherwise the `DatasetType` or name thereof. 

1526 dataId : `dict` or `DataCoordinate` 

1527 A `dict` of `Dimension` link name, value pairs that label the 

1528 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1529 should be provided as the first argument. 

1530 predict : `bool` 

1531 If `True`, allow URIs to be returned of datasets that have not 

1532 been written. 

1533 collections : Any, optional 

1534 Collections to be searched, overriding ``self.collections``. 

1535 Can be any of the types supported by the ``collections`` argument 

1536 to butler construction. 

1537 run : `str`, optional 

1538 Run to use for predictions, overriding ``self.run``. 

1539 **kwargs 

1540 Additional keyword arguments used to augment or construct a 

1541 `DataCoordinate`. See `DataCoordinate.standardize` 

1542 parameters. 

1543 

1544 Returns 

1545 ------- 

1546 uri : `lsst.resources.ResourcePath` 

1547 URI pointing to the Dataset within the datastore. If the 

1548 Dataset does not exist in the datastore, and if ``predict`` is 

1549 `True`, the URI will be a prediction and will include a URI 

1550 fragment "#predicted". 

1551 If the datastore does not have entities that relate well 

1552 to the concept of a URI the returned URI string will be 

1553 descriptive. The returned URI is not guaranteed to be obtainable. 

1554 

1555 Raises 

1556 ------ 

1557 LookupError 

1558 A URI has been requested for a dataset that does not exist and 

1559 guessing is not allowed. 

1560 ValueError 

1561 Raised if a resolved `DatasetRef` was passed as an input, but it 

1562 differs from the one found in the registry. 

1563 TypeError 

1564 Raised if no collections were provided. 

1565 RuntimeError 

1566 Raised if a URI is requested for a dataset that consists of 

1567 multiple artifacts. 

1568 """ 

1569 primary, components = self.getURIs( 

1570 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1571 ) 

1572 

1573 if primary is None or components: 

1574 raise RuntimeError( 

1575 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1576 "Use Butler.getURIs() instead." 

1577 ) 

1578 return primary 

1579 

1580 def retrieveArtifacts( 

1581 self, 

1582 refs: Iterable[DatasetRef], 

1583 destination: ResourcePathExpression, 

1584 transfer: str = "auto", 

1585 preserve_path: bool = True, 

1586 overwrite: bool = False, 

1587 ) -> List[ResourcePath]: 

1588 """Retrieve the artifacts associated with the supplied refs. 

1589 

1590 Parameters 

1591 ---------- 

1592 refs : iterable of `DatasetRef` 

1593 The datasets for which artifacts are to be retrieved. 

1594 A single ref can result in multiple artifacts. The refs must 

1595 be resolved. 

1596 destination : `lsst.resources.ResourcePath` or `str` 

1597 Location to write the artifacts. 

1598 transfer : `str`, optional 

1599 Method to use to transfer the artifacts. Must be one of the options 

1600 supported by `~lsst.resources.ResourcePath.transfer_from()`. 

1601 "move" is not allowed. 

1602 preserve_path : `bool`, optional 

1603 If `True` the full path of the artifact within the datastore 

1604 is preserved. If `False` the final file component of the path 

1605 is used. 

1606 overwrite : `bool`, optional 

1607 If `True` allow transfers to overwrite existing files at the 

1608 destination. 

1609 

1610 Returns 

1611 ------- 

1612 targets : `list` of `lsst.resources.ResourcePath` 

1613 URIs of file artifacts in destination location. Order is not 

1614 preserved. 

1615 

1616 Notes 

1617 ----- 

1618 For non-file datastores the artifacts written to the destination 

1619 may not match the representation inside the datastore. For example 

1620 a hierarchical data structure in a NoSQL database may well be stored 

1621 as a JSON file. 

1622 """ 

1623 return self.datastore.retrieveArtifacts( 

1624 refs, 

1625 ResourcePath(destination), 

1626 transfer=transfer, 

1627 preserve_path=preserve_path, 

1628 overwrite=overwrite, 

1629 ) 

1630 

1631 def datasetExists( 

1632 self, 

1633 datasetRefOrType: Union[DatasetRef, DatasetType, str], 

1634 dataId: Optional[DataId] = None, 

1635 *, 

1636 collections: Any = None, 

1637 **kwargs: Any, 

1638 ) -> bool: 

1639 """Return True if the Dataset is actually present in the Datastore. 

1640 

1641 Parameters 

1642 ---------- 

1643 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1644 When `DatasetRef` the `dataId` should be `None`. 

1645 Otherwise the `DatasetType` or name thereof. 

1646 dataId : `dict` or `DataCoordinate` 

1647 A `dict` of `Dimension` link name, value pairs that label the 

1648 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1649 should be provided as the first argument. 

1650 collections : Any, optional 

1651 Collections to be searched, overriding ``self.collections``. 

1652 Can be any of the types supported by the ``collections`` argument 

1653 to butler construction. 

1654 **kwargs 

1655 Additional keyword arguments used to augment or construct a 

1656 `DataCoordinate`. See `DataCoordinate.standardize` 

1657 parameters. 

1658 

1659 Raises 

1660 ------ 

1661 LookupError 

1662 Raised if the dataset is not even present in the Registry. 

1663 ValueError 

1664 Raised if a resolved `DatasetRef` was passed as an input, but it 

1665 differs from the one found in the registry. 

1666 TypeError 

1667 Raised if no collections were provided. 

1668 """ 

1669 # A resolved ref may be given that is not known to this butler. 

1670 if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None: 

1671 ref = self.registry.getDataset(datasetRefOrType.id) 

1672 if ref is None: 

1673 raise LookupError( 

1674 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1675 ) 

1676 else: 

1677 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1678 return self.datastore.exists(ref) 

1679 

1680 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1681 """Remove one or more `~CollectionType.RUN` collections and the 

1682 datasets within them. 

1683 

1684 Parameters 

1685 ---------- 

1686 names : `Iterable` [ `str` ] 

1687 The names of the collections to remove. 

1688 unstore : `bool`, optional 

1689 If `True` (default), delete datasets from all datastores in which 

1690 they are present, and attempt to rollback the registry deletions if 

1691 datastore deletions fail (which may not always be possible). If 

1692 `False`, datastore records for these datasets are still removed, 

1693 but any artifacts (e.g. files) will not be. 

1694 

1695 Raises 

1696 ------ 

1697 TypeError 

1698 Raised if one or more collections are not of type 

1699 `~CollectionType.RUN`. 

1700 """ 

1701 if not self.isWriteable(): 

1702 raise TypeError("Butler is read-only.") 

1703 names = list(names) 

1704 refs: List[DatasetRef] = [] 

1705 for name in names: 

1706 collectionType = self.registry.getCollectionType(name) 

1707 if collectionType is not CollectionType.RUN: 

1708 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1709 refs.extend(self.registry.queryDatasets(..., collections=name, findFirst=True)) 

1710 with self.datastore.transaction(): 

1711 with self.registry.transaction(): 

1712 if unstore: 

1713 self.datastore.trash(refs) 

1714 else: 

1715 self.datastore.forget(refs) 

1716 for name in names: 

1717 self.registry.removeCollection(name) 

1718 if unstore: 

1719 # Point of no return for removing artifacts 

1720 self.datastore.emptyTrash() 

1721 

1722 def pruneDatasets( 

1723 self, 

1724 refs: Iterable[DatasetRef], 

1725 *, 

1726 disassociate: bool = True, 

1727 unstore: bool = False, 

1728 tags: Iterable[str] = (), 

1729 purge: bool = False, 

1730 ) -> None: 

1731 # docstring inherited from LimitedButler 

1732 

1733 if not self.isWriteable(): 

1734 raise TypeError("Butler is read-only.") 

1735 if purge: 

1736 if not disassociate: 

1737 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1738 if not unstore: 

1739 raise TypeError("Cannot pass purge=True without unstore=True.") 

1740 elif disassociate: 

1741 tags = tuple(tags) 

1742 if not tags: 

1743 raise TypeError("No tags provided but disassociate=True.") 

1744 for tag in tags: 

1745 collectionType = self.registry.getCollectionType(tag) 

1746 if collectionType is not CollectionType.TAGGED: 

1747 raise TypeError( 

1748 f"Cannot disassociate from collection '{tag}' " 

1749 f"of non-TAGGED type {collectionType.name}." 

1750 ) 

1751 # For an execution butler we want to keep existing UUIDs for the 

1752 # datasets, for that we need to keep them in the collections but 

1753 # remove from datastore. 

1754 if self._allow_put_of_predefined_dataset and purge: 

1755 purge = False 

1756 disassociate = False 

1757 # Transform possibly-single-pass iterable into something we can iterate 

1758 # over multiple times. 

1759 refs = list(refs) 

1760 # Pruning a component of a DatasetRef makes no sense since registry 

1761 # doesn't know about components and datastore might not store 

1762 # components in a separate file 

1763 for ref in refs: 

1764 if ref.datasetType.component(): 

1765 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1766 # We don't need an unreliable Datastore transaction for this, because 

1767 # we've been extra careful to ensure that Datastore.trash only involves 

1768 # mutating the Registry (it can _look_ at Datastore-specific things, 

1769 # but shouldn't change them), and hence all operations here are 

1770 # Registry operations. 

1771 with self.datastore.transaction(): 

1772 with self.registry.transaction(): 

1773 if unstore: 

1774 self.datastore.trash(refs) 

1775 if purge: 

1776 self.registry.removeDatasets(refs) 

1777 elif disassociate: 

1778 assert tags, "Guaranteed by earlier logic in this function." 

1779 for tag in tags: 

1780 self.registry.disassociate(tag, refs) 

1781 # We've exited the Registry transaction, and apparently committed. 

1782 # (if there was an exception, everything rolled back, and it's as if 

1783 # nothing happened - and we never get here). 

1784 # Datastore artifacts are not yet gone, but they're clearly marked 

1785 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1786 # problems we can try again later, and if manual administrative 

1787 # intervention is required, it's pretty clear what that should entail: 

1788 # deleting everything on disk and in private Datastore tables that is 

1789 # in the dataset_location_trash table. 

1790 if unstore: 

1791 # Point of no return for removing artifacts 

1792 self.datastore.emptyTrash() 

1793 

1794 @transactional 

1795 def ingest( 

1796 self, 

1797 *datasets: FileDataset, 

1798 transfer: Optional[str] = "auto", 

1799 run: Optional[str] = None, 

1800 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

1801 record_validation_info: bool = True, 

1802 ) -> None: 

1803 """Store and register one or more datasets that already exist on disk. 

1804 

1805 Parameters 

1806 ---------- 

1807 datasets : `FileDataset` 

1808 Each positional argument is a struct containing information about 

1809 a file to be ingested, including its URI (either absolute or 

1810 relative to the datastore root, if applicable), a `DatasetRef`, 

1811 and optionally a formatter class or its fully-qualified string 

1812 name. If a formatter is not provided, the formatter that would be 

1813 used for `put` is assumed. On successful return, all 

1814 `FileDataset.ref` attributes will have their `DatasetRef.id` 

1815 attribute populated and all `FileDataset.formatter` attributes will 

1816 be set to the formatter class used. `FileDataset.path` attributes 

1817 may be modified to put paths in whatever the datastore considers a 

1818 standardized form. 

1819 transfer : `str`, optional 

1820 If not `None`, must be one of 'auto', 'move', 'copy', 'direct', 

1821 'split', 'hardlink', 'relsymlink' or 'symlink', indicating how to 

1822 transfer the file. 

1823 run : `str`, optional 

1824 The name of the run ingested datasets should be added to, 

1825 overriding ``self.run``. 

1826 idGenerationMode : `DatasetIdGenEnum`, optional 

1827 Specifies option for generating dataset IDs. By default unique IDs 

1828 are generated for each inserted dataset. 

1829 record_validation_info : `bool`, optional 

1830 If `True`, the default, the datastore can record validation 

1831 information associated with the file. If `False` the datastore 

1832 will not attempt to track any information such as checksums 

1833 or file sizes. This can be useful if such information is tracked 

1834 in an external system or if the file is to be compressed in place. 

1835 It is up to the datastore whether this parameter is relevant. 

1836 

1837 Raises 

1838 ------ 

1839 TypeError 

1840 Raised if the butler is read-only or if no run was provided. 

1841 NotImplementedError 

1842 Raised if the `Datastore` does not support the given transfer mode. 

1843 DatasetTypeNotSupportedError 

1844 Raised if one or more files to be ingested have a dataset type that 

1845 is not supported by the `Datastore`.. 

1846 FileNotFoundError 

1847 Raised if one of the given files does not exist. 

1848 FileExistsError 

1849 Raised if transfer is not `None` but the (internal) location the 

1850 file would be moved to is already occupied. 

1851 

1852 Notes 

1853 ----- 

1854 This operation is not fully exception safe: if a database operation 

1855 fails, the given `FileDataset` instances may be only partially updated. 

1856 

1857 It is atomic in terms of database operations (they will either all 

1858 succeed or all fail) providing the database engine implements 

1859 transactions correctly. It will attempt to be atomic in terms of 

1860 filesystem operations as well, but this cannot be implemented 

1861 rigorously for most datastores. 

1862 """ 

1863 if not self.isWriteable(): 

1864 raise TypeError("Butler is read-only.") 

1865 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1866 # Reorganize the inputs so they're grouped by DatasetType and then 

1867 # data ID. We also include a list of DatasetRefs for each FileDataset 

1868 # to hold the resolved DatasetRefs returned by the Registry, before 

1869 # it's safe to swap them into FileDataset.refs. 

1870 # Some type annotation aliases to make that clearer: 

1871 GroupForType = Dict[DataCoordinate, Tuple[FileDataset, List[DatasetRef]]] 

1872 GroupedData = MutableMapping[DatasetType, GroupForType] 

1873 # The actual data structure: 

1874 groupedData: GroupedData = defaultdict(dict) 

1875 # And the nested loop that populates it: 

1876 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1877 # This list intentionally shared across the inner loop, since it's 

1878 # associated with `dataset`. 

1879 resolvedRefs: List[DatasetRef] = [] 

1880 

1881 # Somewhere to store pre-existing refs if we have an 

1882 # execution butler. 

1883 existingRefs: List[DatasetRef] = [] 

1884 

1885 for ref in dataset.refs: 

1886 if ref.dataId in groupedData[ref.datasetType]: 

1887 raise ConflictingDefinitionError( 

1888 f"Ingest conflict. Dataset {dataset.path} has same" 

1889 " DataId as other ingest dataset" 

1890 f" {groupedData[ref.datasetType][ref.dataId][0].path} " 

1891 f" ({ref.dataId})" 

1892 ) 

1893 if self._allow_put_of_predefined_dataset: 

1894 existing_ref = self.registry.findDataset( 

1895 ref.datasetType, dataId=ref.dataId, collections=run 

1896 ) 

1897 if existing_ref: 

1898 if self.datastore.knows(existing_ref): 

1899 raise ConflictingDefinitionError( 

1900 f"Dataset associated with path {dataset.path}" 

1901 f" already exists as {existing_ref}." 

1902 ) 

1903 # Store this ref elsewhere since it already exists 

1904 # and we do not want to remake it but we do want 

1905 # to store it in the datastore. 

1906 existingRefs.append(existing_ref) 

1907 

1908 # Nothing else to do until we have finished 

1909 # iterating. 

1910 continue 

1911 

1912 groupedData[ref.datasetType][ref.dataId] = (dataset, resolvedRefs) 

1913 

1914 if existingRefs: 

1915 if len(dataset.refs) != len(existingRefs): 

1916 # Keeping track of partially pre-existing datasets is hard 

1917 # and should generally never happen. For now don't allow 

1918 # it. 

1919 raise ConflictingDefinitionError( 

1920 f"For dataset {dataset.path} some dataIds already exist" 

1921 " in registry but others do not. This is not supported." 

1922 ) 

1923 

1924 # Attach the resolved refs if we found them. 

1925 dataset.refs = existingRefs 

1926 

1927 # Now we can bulk-insert into Registry for each DatasetType. 

1928 for datasetType, groupForType in progress.iter_item_chunks( 

1929 groupedData.items(), desc="Bulk-inserting datasets by type" 

1930 ): 

1931 refs = self.registry.insertDatasets( 

1932 datasetType, 

1933 dataIds=groupForType.keys(), 

1934 run=run, 

1935 expand=self.datastore.needs_expanded_data_ids(transfer, datasetType), 

1936 idGenerationMode=idGenerationMode, 

1937 ) 

1938 # Append those resolved DatasetRefs to the new lists we set up for 

1939 # them. 

1940 for ref, (_, resolvedRefs) in zip(refs, groupForType.values()): 

1941 resolvedRefs.append(ref) 

1942 

1943 # Go back to the original FileDatasets to replace their refs with the 

1944 # new resolved ones. 

1945 for groupForType in progress.iter_chunks( 

1946 groupedData.values(), desc="Reassociating resolved dataset refs with files" 

1947 ): 

1948 for dataset, resolvedRefs in groupForType.values(): 

1949 dataset.refs = resolvedRefs 

1950 

1951 # Bulk-insert everything into Datastore. 

1952 self.datastore.ingest(*datasets, transfer=transfer, record_validation_info=record_validation_info) 

1953 

1954 @contextlib.contextmanager 

1955 def export( 

1956 self, 

1957 *, 

1958 directory: Optional[str] = None, 

1959 filename: Optional[str] = None, 

1960 format: Optional[str] = None, 

1961 transfer: Optional[str] = None, 

1962 ) -> Iterator[RepoExportContext]: 

1963 """Export datasets from the repository represented by this `Butler`. 

1964 

1965 This method is a context manager that returns a helper object 

1966 (`RepoExportContext`) that is used to indicate what information from 

1967 the repository should be exported. 

1968 

1969 Parameters 

1970 ---------- 

1971 directory : `str`, optional 

1972 Directory dataset files should be written to if ``transfer`` is not 

1973 `None`. 

1974 filename : `str`, optional 

1975 Name for the file that will include database information associated 

1976 with the exported datasets. If this is not an absolute path and 

1977 ``directory`` is not `None`, it will be written to ``directory`` 

1978 instead of the current working directory. Defaults to 

1979 "export.{format}". 

1980 format : `str`, optional 

1981 File format for the database information file. If `None`, the 

1982 extension of ``filename`` will be used. 

1983 transfer : `str`, optional 

1984 Transfer mode passed to `Datastore.export`. 

1985 

1986 Raises 

1987 ------ 

1988 TypeError 

1989 Raised if the set of arguments passed is inconsistent. 

1990 

1991 Examples 

1992 -------- 

1993 Typically the `Registry.queryDataIds` and `Registry.queryDatasets` 

1994 methods are used to provide the iterables over data IDs and/or datasets 

1995 to be exported:: 

1996 

1997 with butler.export("exports.yaml") as export: 

1998 # Export all flats, but none of the dimension element rows 

1999 # (i.e. data ID information) associated with them. 

2000 export.saveDatasets(butler.registry.queryDatasets("flat"), 

2001 elements=()) 

2002 # Export all datasets that start with "deepCoadd_" and all of 

2003 # their associated data ID information. 

2004 export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) 

2005 """ 

2006 if directory is None and transfer is not None: 

2007 raise TypeError("Cannot transfer without providing a directory.") 

2008 if transfer == "move": 

2009 raise TypeError("Transfer may not be 'move': export is read-only") 

2010 if format is None: 

2011 if filename is None: 

2012 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2013 else: 

2014 _, format = os.path.splitext(filename) 

2015 if not format: 

2016 raise ValueError("Please specify a file extension to determine export format.") 

2017 format = format[1:] # Strip leading "."" 

2018 elif filename is None: 

2019 filename = f"export.{format}" 

2020 if directory is not None: 

2021 filename = os.path.join(directory, filename) 

2022 formats = self._config["repo_transfer_formats"] 

2023 if format not in formats: 

2024 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

2025 BackendClass = get_class_of(formats[format, "export"]) 

2026 with open(filename, "w") as stream: 

2027 backend = BackendClass(stream, universe=self.registry.dimensions) 

2028 try: 

2029 helper = RepoExportContext( 

2030 self.registry, self.datastore, backend=backend, directory=directory, transfer=transfer 

2031 ) 

2032 yield helper 

2033 except BaseException: 

2034 raise 

2035 else: 

2036 helper._finish() 

2037 

2038 def import_( 

2039 self, 

2040 *, 

2041 directory: Optional[ResourcePathExpression] = None, 

2042 filename: Union[ResourcePathExpression, TextIO, None] = None, 

2043 format: Optional[str] = None, 

2044 transfer: Optional[str] = None, 

2045 skip_dimensions: Optional[Set] = None, 

2046 idGenerationMode: DatasetIdGenEnum = DatasetIdGenEnum.UNIQUE, 

2047 reuseIds: bool = False, 

2048 ) -> None: 

2049 """Import datasets into this repository that were exported from a 

2050 different butler repository via `~lsst.daf.butler.Butler.export`. 

2051 

2052 Parameters 

2053 ---------- 

2054 directory : `~lsst.resources.ResourcePathExpression`, optional 

2055 Directory containing dataset files to import from. If `None`, 

2056 ``filename`` and all dataset file paths specified therein must 

2057 be absolute. 

2058 filename : `~lsst.resources.ResourcePathExpression` or `TextIO` 

2059 A stream or name of file that contains database information 

2060 associated with the exported datasets, typically generated by 

2061 `~lsst.daf.butler.Butler.export`. If this a string (name) or 

2062 `~lsst.resources.ResourcePath` and is not an absolute path, 

2063 it will first be looked for relative to ``directory`` and if not 

2064 found there it will be looked for in the current working 

2065 directory. Defaults to "export.{format}". 

2066 format : `str`, optional 

2067 File format for ``filename``. If `None`, the extension of 

2068 ``filename`` will be used. 

2069 transfer : `str`, optional 

2070 Transfer mode passed to `~lsst.daf.butler.Datastore.ingest`. 

2071 skip_dimensions : `set`, optional 

2072 Names of dimensions that should be skipped and not imported. 

2073 idGenerationMode : `DatasetIdGenEnum`, optional 

2074 Specifies option for generating dataset IDs when IDs are not 

2075 provided or their type does not match backend type. By default 

2076 unique IDs are generated for each inserted dataset. 

2077 reuseIds : `bool`, optional 

2078 If `True` then forces re-use of imported dataset IDs for integer 

2079 IDs which are normally generated as auto-incremented; exception 

2080 will be raised if imported IDs clash with existing ones. This 

2081 option has no effect on the use of globally-unique IDs which are 

2082 always re-used (or generated if integer IDs are being imported). 

2083 

2084 Raises 

2085 ------ 

2086 TypeError 

2087 Raised if the set of arguments passed is inconsistent, or if the 

2088 butler is read-only. 

2089 """ 

2090 if not self.isWriteable(): 

2091 raise TypeError("Butler is read-only.") 

2092 if format is None: 

2093 if filename is None: 

2094 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

2095 else: 

2096 _, format = os.path.splitext(filename) # type: ignore 

2097 elif filename is None: 

2098 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

2099 if directory is not None: 

2100 directory = ResourcePath(directory, forceDirectory=True) 

2101 # mypy doesn't think this will work but it does in python >= 3.10. 

2102 if isinstance(filename, ResourcePathExpression): # type: ignore 

2103 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

2104 if not filename.isabs() and directory is not None: 

2105 potential = directory.join(filename) 

2106 exists_in_cwd = filename.exists() 

2107 exists_in_dir = potential.exists() 

2108 if exists_in_cwd and exists_in_dir: 

2109 log.warning( 

2110 "A relative path for filename was specified (%s) which exists relative to cwd. " 

2111 "Additionally, the file exists relative to the given search directory (%s). " 

2112 "Using the export file in the given directory.", 

2113 filename, 

2114 potential, 

2115 ) 

2116 # Given they specified an explicit directory and that 

2117 # directory has the export file in it, assume that that 

2118 # is what was meant despite the file in cwd. 

2119 filename = potential 

2120 elif exists_in_dir: 

2121 filename = potential 

2122 elif not exists_in_cwd and not exists_in_dir: 

2123 # Raise early. 

2124 raise FileNotFoundError( 

2125 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

2126 ) 

2127 BackendClass = get_class_of(self._config["repo_transfer_formats"][format]["import"]) 

2128 

2129 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

2130 backend = BackendClass(importStream, self.registry) 

2131 backend.register() 

2132 with self.transaction(): 

2133 backend.load( 

2134 self.datastore, 

2135 directory=directory, 

2136 transfer=transfer, 

2137 skip_dimensions=skip_dimensions, 

2138 idGenerationMode=idGenerationMode, 

2139 reuseIds=reuseIds, 

2140 ) 

2141 

2142 if isinstance(filename, ResourcePath): 

2143 # We can not use open() here at the moment because of 

2144 # DM-38589 since yaml does stream.read(8192) in a loop. 

2145 stream = io.StringIO(filename.read().decode()) 

2146 doImport(stream) 

2147 else: 

2148 doImport(filename) # type: ignore 

2149 

2150 def transfer_from( 

2151 self, 

2152 source_butler: LimitedButler, 

2153 source_refs: Iterable[DatasetRef], 

2154 transfer: str = "auto", 

2155 skip_missing: bool = True, 

2156 register_dataset_types: bool = False, 

2157 transfer_dimensions: bool = False, 

2158 ) -> collections.abc.Collection[DatasetRef]: 

2159 """Transfer datasets to this Butler from a run in another Butler. 

2160 

2161 Parameters 

2162 ---------- 

2163 source_butler : `LimitedButler` 

2164 Butler from which the datasets are to be transferred. If data IDs 

2165 in ``source_refs`` are not expanded then this has to be a full 

2166 `Butler` whose registry will be used to expand data IDs. 

2167 source_refs : iterable of `DatasetRef` 

2168 Datasets defined in the source butler that should be transferred to 

2169 this butler. 

2170 transfer : `str`, optional 

2171 Transfer mode passed to `~lsst.daf.butler.Datastore.transfer_from`. 

2172 skip_missing : `bool` 

2173 If `True`, datasets with no datastore artifact associated with 

2174 them are not transferred. If `False` a registry entry will be 

2175 created even if no datastore record is created (and so will 

2176 look equivalent to the dataset being unstored). 

2177 register_dataset_types : `bool` 

2178 If `True` any missing dataset types are registered. Otherwise 

2179 an exception is raised. 

2180 transfer_dimensions : `bool`, optional 

2181 If `True`, dimension record data associated with the new datasets 

2182 will be transferred. 

2183 

2184 Returns 

2185 ------- 

2186 refs : `list` of `DatasetRef` 

2187 The refs added to this Butler. 

2188 

2189 Notes 

2190 ----- 

2191 The datastore artifact has to exist for a transfer 

2192 to be made but non-existence is not an error. 

2193 

2194 Datasets that already exist in this run will be skipped. 

2195 

2196 The datasets are imported as part of a transaction, although 

2197 dataset types are registered before the transaction is started. 

2198 This means that it is possible for a dataset type to be registered 

2199 even though transfer has failed. 

2200 """ 

2201 if not self.isWriteable(): 

2202 raise TypeError("Butler is read-only.") 

2203 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

2204 

2205 # Will iterate through the refs multiple times so need to convert 

2206 # to a list if this isn't a collection. 

2207 if not isinstance(source_refs, collections.abc.Collection): 

2208 source_refs = list(source_refs) 

2209 

2210 original_count = len(source_refs) 

2211 log.info("Transferring %d datasets into %s", original_count, str(self)) 

2212 

2213 # In some situations the datastore artifact may be missing 

2214 # and we do not want that registry entry to be imported. 

2215 # Asking datastore is not sufficient, the records may have been 

2216 # purged, we have to ask for the (predicted) URI and check 

2217 # existence explicitly. Execution butler is set up exactly like 

2218 # this with no datastore records. 

2219 artifact_existence: Dict[ResourcePath, bool] = {} 

2220 if skip_missing: 

2221 dataset_existence = source_butler.datastore.mexists( 

2222 source_refs, artifact_existence=artifact_existence 

2223 ) 

2224 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

2225 filtered_count = len(source_refs) 

2226 n_missing = original_count - filtered_count 

2227 log.verbose( 

2228 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

2229 n_missing, 

2230 "" if n_missing == 1 else "s", 

2231 filtered_count, 

2232 ) 

2233 

2234 # Importing requires that we group the refs by dataset type and run 

2235 # before doing the import. 

2236 source_dataset_types = set() 

2237 grouped_refs = defaultdict(list) 

2238 for ref in source_refs: 

2239 grouped_refs[ref.datasetType, ref.run].append(ref) 

2240 source_dataset_types.add(ref.datasetType) 

2241 

2242 # Check to see if the dataset type in the source butler has 

2243 # the same definition in the target butler and register missing 

2244 # ones if requested. Registration must happen outside a transaction. 

2245 newly_registered_dataset_types = set() 

2246 for datasetType in source_dataset_types: 

2247 if register_dataset_types: 

2248 # Let this raise immediately if inconsistent. Continuing 

2249 # on to find additional inconsistent dataset types 

2250 # might result in additional unwanted dataset types being 

2251 # registered. 

2252 if self.registry.registerDatasetType(datasetType): 

2253 newly_registered_dataset_types.add(datasetType) 

2254 else: 

2255 # If the dataset type is missing, let it fail immediately. 

2256 target_dataset_type = self.registry.getDatasetType(datasetType.name) 

2257 if target_dataset_type != datasetType: 

2258 raise ConflictingDefinitionError( 

2259 "Source butler dataset type differs from definition" 

2260 f" in target butler: {datasetType} !=" 

2261 f" {target_dataset_type}" 

2262 ) 

2263 if newly_registered_dataset_types: 

2264 # We may have registered some even if there were inconsistencies 

2265 # but should let people know (or else remove them again). 

2266 log.log( 

2267 VERBOSE, 

2268 "Registered the following dataset types in the target Butler: %s", 

2269 ", ".join(d.name for d in newly_registered_dataset_types), 

2270 ) 

2271 else: 

2272 log.log(VERBOSE, "All required dataset types are known to the target Butler") 

2273 

2274 dimension_records: Dict[DimensionElement, Dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

2275 if transfer_dimensions: 

2276 # Collect all the dimension records for these refs. 

2277 # All dimensions are to be copied but the list of valid dimensions 

2278 # come from this butler's universe. 

2279 elements = frozenset( 

2280 element 

2281 for element in self.registry.dimensions.getStaticElements() 

2282 if element.hasTable() and element.viewOf is None 

2283 ) 

2284 dataIds = set(ref.dataId for ref in source_refs) 

2285 # This logic comes from saveDataIds. 

2286 for dataId in dataIds: 

2287 # Need an expanded record, if not expanded that we need a full 

2288 # butler with registry (allow mocks with registry too). 

2289 if not dataId.hasRecords(): 

2290 if registry := getattr(source_butler, "registry", None): 

2291 dataId = registry.expandDataId(dataId) 

2292 else: 

2293 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

2294 # If this butler doesn't know about a dimension in the source 

2295 # butler things will break later. 

2296 for record in dataId.records.values(): 

2297 if record is not None and record.definition in elements: 

2298 dimension_records[record.definition].setdefault(record.dataId, record) 

2299 

2300 handled_collections: Set[str] = set() 

2301 

2302 # Do all the importing in a single transaction. 

2303 with self.transaction(): 

2304 if dimension_records: 

2305 log.verbose("Ensuring that dimension records exist for transferred datasets.") 

2306 for element, r in dimension_records.items(): 

2307 records = [r[dataId] for dataId in r] 

2308 # Assume that if the record is already present that we can 

2309 # use it without having to check that the record metadata 

2310 # is consistent. 

2311 self.registry.insertDimensionData(element, *records, skip_existing=True) 

2312 

2313 n_imported = 0 

2314 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2315 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2316 ): 

2317 if run not in handled_collections: 

2318 # May need to create output collection. If source butler 

2319 # has a registry, ask for documentation string. 

2320 run_doc = None 

2321 if registry := getattr(source_butler, "registry", None): 

2322 run_doc = registry.getCollectionDocumentation(run) 

2323 registered = self.registry.registerRun(run, doc=run_doc) 

2324 handled_collections.add(run) 

2325 if registered: 

2326 log.log(VERBOSE, "Creating output run %s", run) 

2327 

2328 n_refs = len(refs_to_import) 

2329 log.verbose( 

2330 "Importing %d ref%s of dataset type %s into run %s", 

2331 n_refs, 

2332 "" if n_refs == 1 else "s", 

2333 datasetType.name, 

2334 run, 

2335 ) 

2336 

2337 # Assume we are using UUIDs and the source refs will match 

2338 # those imported. 

2339 imported_refs = self.registry._importDatasets(refs_to_import, expand=False) 

2340 assert set(imported_refs) == set(refs_to_import) 

2341 n_imported += len(imported_refs) 

2342 

2343 assert len(source_refs) == n_imported 

2344 log.verbose("Imported %d datasets into destination butler", n_imported) 

2345 

2346 # Ask the datastore to transfer. The datastore has to check that 

2347 # the source datastore is compatible with the target datastore. 

2348 accepted, rejected = self.datastore.transfer_from( 

2349 source_butler.datastore, 

2350 source_refs, 

2351 transfer=transfer, 

2352 artifact_existence=artifact_existence, 

2353 ) 

2354 if rejected: 

2355 # For now, accept the registry entries but not the files. 

2356 log.warning( 

2357 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2358 len(rejected), 

2359 len(accepted), 

2360 datasetType, 

2361 run, 

2362 ) 

2363 

2364 return source_refs 

2365 

2366 def validateConfiguration( 

2367 self, 

2368 logFailures: bool = False, 

2369 datasetTypeNames: Optional[Iterable[str]] = None, 

2370 ignore: Iterable[str] | None = None, 

2371 ) -> None: 

2372 """Validate butler configuration. 

2373 

2374 Checks that each `DatasetType` can be stored in the `Datastore`. 

2375 

2376 Parameters 

2377 ---------- 

2378 logFailures : `bool`, optional 

2379 If `True`, output a log message for every validation error 

2380 detected. 

2381 datasetTypeNames : iterable of `str`, optional 

2382 The `DatasetType` names that should be checked. This allows 

2383 only a subset to be selected. 

2384 ignore : iterable of `str`, optional 

2385 Names of DatasetTypes to skip over. This can be used to skip 

2386 known problems. If a named `DatasetType` corresponds to a 

2387 composite, all components of that `DatasetType` will also be 

2388 ignored. 

2389 

2390 Raises 

2391 ------ 

2392 ButlerValidationError 

2393 Raised if there is some inconsistency with how this Butler 

2394 is configured. 

2395 """ 

2396 if datasetTypeNames: 

2397 datasetTypes = [self.registry.getDatasetType(name) for name in datasetTypeNames] 

2398 else: 

2399 datasetTypes = list(self.registry.queryDatasetTypes()) 

2400 

2401 # filter out anything from the ignore list 

2402 if ignore: 

2403 ignore = set(ignore) 

2404 datasetTypes = [ 

2405 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2406 ] 

2407 else: 

2408 ignore = set() 

2409 

2410 # Find all the registered instruments 

2411 instruments = set(record.name for record in self.registry.queryDimensionRecords("instrument")) 

2412 

2413 # For each datasetType that has an instrument dimension, create 

2414 # a DatasetRef for each defined instrument 

2415 datasetRefs = [] 

2416 

2417 for datasetType in datasetTypes: 

2418 if "instrument" in datasetType.dimensions: 

2419 for instrument in instruments: 

2420 datasetRef = DatasetRef( 

2421 datasetType, 

2422 {"instrument": instrument}, # type: ignore 

2423 conform=False, 

2424 run="validate", 

2425 ) 

2426 datasetRefs.append(datasetRef) 

2427 

2428 entities: List[Union[DatasetType, DatasetRef]] = [] 

2429 entities.extend(datasetTypes) 

2430 entities.extend(datasetRefs) 

2431 

2432 datastoreErrorStr = None 

2433 try: 

2434 self.datastore.validateConfiguration(entities, logFailures=logFailures) 

2435 except ValidationError as e: 

2436 datastoreErrorStr = str(e) 

2437 

2438 # Also check that the LookupKeys used by the datastores match 

2439 # registry and storage class definitions 

2440 keys = self.datastore.getLookupKeys() 

2441 

2442 failedNames = set() 

2443 failedDataId = set() 

2444 for key in keys: 

2445 if key.name is not None: 

2446 if key.name in ignore: 

2447 continue 

2448 

2449 # skip if specific datasetType names were requested and this 

2450 # name does not match 

2451 if datasetTypeNames and key.name not in datasetTypeNames: 

2452 continue 

2453 

2454 # See if it is a StorageClass or a DatasetType 

2455 if key.name in self.storageClasses: 

2456 pass 

2457 else: 

2458 try: 

2459 self.registry.getDatasetType(key.name) 

2460 except KeyError: 

2461 if logFailures: 

2462 log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) 

2463 failedNames.add(key) 

2464 else: 

2465 # Dimensions are checked for consistency when the Butler 

2466 # is created and rendezvoused with a universe. 

2467 pass 

2468 

2469 # Check that the instrument is a valid instrument 

2470 # Currently only support instrument so check for that 

2471 if key.dataId: 

2472 dataIdKeys = set(key.dataId) 

2473 if set(["instrument"]) != dataIdKeys: 

2474 if logFailures: 

2475 log.critical("Key '%s' has unsupported DataId override", key) 

2476 failedDataId.add(key) 

2477 elif key.dataId["instrument"] not in instruments: 

2478 if logFailures: 

2479 log.critical("Key '%s' has unknown instrument", key) 

2480 failedDataId.add(key) 

2481 

2482 messages = [] 

2483 

2484 if datastoreErrorStr: 

2485 messages.append(datastoreErrorStr) 

2486 

2487 for failed, msg in ( 

2488 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2489 (failedDataId, "Keys with bad DataId entries: "), 

2490 ): 

2491 if failed: 

2492 msg += ", ".join(str(k) for k in failed) 

2493 messages.append(msg) 

2494 

2495 if messages: 

2496 raise ValidationError(";\n".join(messages)) 

2497 

2498 @property 

2499 def collections(self) -> Sequence[str]: 

2500 """The collections to search by default, in order 

2501 (`Sequence` [ `str` ]). 

2502 

2503 This is an alias for ``self.registry.defaults.collections``. It cannot 

2504 be set directly in isolation, but all defaults may be changed together 

2505 by assigning a new `RegistryDefaults` instance to 

2506 ``self.registry.defaults``. 

2507 """ 

2508 return self.registry.defaults.collections 

2509 

2510 @property 

2511 def run(self) -> Optional[str]: 

2512 """Name of the run this butler writes outputs to by default (`str` or 

2513 `None`). 

2514 

2515 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2516 directly in isolation, but all defaults may be changed together by 

2517 assigning a new `RegistryDefaults` instance to 

2518 ``self.registry.defaults``. 

2519 """ 

2520 return self.registry.defaults.run 

2521 

2522 @property 

2523 def dimensions(self) -> DimensionUniverse: 

2524 # Docstring inherited. 

2525 return self.registry.dimensions 

2526 

2527 registry: Registry 

2528 """The object that manages dataset metadata and relationships (`Registry`). 

2529 

2530 Most operations that don't involve reading or writing butler datasets are 

2531 accessible only via `Registry` methods. 

2532 """ 

2533 

2534 datastore: Datastore 

2535 """The object that manages actual dataset storage (`Datastore`). 

2536 

2537 Direct user access to the datastore should rarely be necessary; the primary 

2538 exception is the case where a `Datastore` implementation provides extra 

2539 functionality beyond what the base class defines. 

2540 """ 

2541 

2542 storageClasses: StorageClassFactory 

2543 """An object that maps known storage class names to objects that fully 

2544 describe them (`StorageClassFactory`). 

2545 """ 

2546 

2547 _allow_put_of_predefined_dataset: bool 

2548 """Allow a put to succeed even if there is already a registry entry for it 

2549 but not a datastore record. (`bool`)."""