Coverage for python/lsst/daf/butler/direct_butler.py: 10%

749 statements  

coverage.py v7.4.3, created at 2024-03-07 11:04 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast 

48 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.logging import VERBOSE, getLogger 

52from sqlalchemy.exc import IntegrityError 

53 

54from ._butler import Butler 

55from ._butler_config import ButlerConfig 

56from ._butler_instance_options import ButlerInstanceOptions 

57from ._dataset_existence import DatasetExistence 

58from ._dataset_ref import DatasetRef 

59from ._dataset_type import DatasetType 

60from ._deferredDatasetHandle import DeferredDatasetHandle 

61from ._exceptions import ValidationError 

62from ._limited_butler import LimitedButler 

63from ._registry_shim import RegistryShim 

64from ._storage_class import StorageClass, StorageClassFactory 

65from ._timespan import Timespan 

66from .datastore import Datastore, NullDatastore 

67from .dimensions import DataCoordinate, Dimension 

68from .progress import Progress 

69from .queries import Query 

70from .registry import ( 

71 CollectionType, 

72 ConflictingDefinitionError, 

73 DataIdError, 

74 MissingDatasetTypeError, 

75 RegistryDefaults, 

76 _RegistryFactory, 

77) 

78from .registry.sql_registry import SqlRegistry 

79from .transfers import RepoExportContext 

80from .utils import transactional 

81 

82if TYPE_CHECKING: 

83 from lsst.resources import ResourceHandleProtocol 

84 

85 from ._dataset_ref import DatasetId 

86 from ._file_dataset import FileDataset 

87 from .datastore import DatasetRefURIs 

88 from .dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse 

89 from .registry import Registry 

90 from .transfers import RepoImportBackend 

91 

92_LOG = getLogger(__name__) 

93 

94 

95class ButlerValidationError(ValidationError): 

96 """There is a problem with the Butler configuration.""" 

97 

98 pass 

99 

100 

101class DirectButler(Butler): # numpydoc ignore=PR02 

102 """Main entry point for the data access system. 

103 

104 Parameters 

105 ---------- 

106 config : `ButlerConfig` 

107 The configuration for this Butler instance. 

108 registry : `SqlRegistry` 

109 The object that manages dataset metadata and relationships. 

110 datastore : `Datastore`

111 The object that manages actual dataset storage. 

112 storageClasses : `StorageClassFactory`

113 An object that maps known storage class names to objects that fully 

114 describe them. 

115 

116 Notes 

117 ----- 

118 Most users should call the top-level `Butler`.``from_config`` instead of 

119 using this constructor directly. 

120 """ 

121 

122 # This is __new__ instead of __init__ because we have to support 

123 # instantiation via the legacy constructor Butler.__new__(), which 

124 # reads the configuration and selects which subclass to instantiate. The 

125 # interaction between __new__ and __init__ is kind of wacky in Python. If 

126 # we were using __init__ here, __init__ would be called twice (once when 

127 # the DirectButler instance is constructed inside Butler.from_config(), and 

128 # a second time with the original arguments to Butler() when the instance 

129 # is returned from Butler.__new__()).

130 def __new__( 

131 cls, 

132 *, 

133 config: ButlerConfig, 

134 registry: SqlRegistry, 

135 datastore: Datastore, 

136 storageClasses: StorageClassFactory, 

137 ) -> DirectButler: 

138 self = cast(DirectButler, super().__new__(cls)) 

139 self._config = config 

140 self._registry = registry 

141 self._datastore = datastore 

142 self.storageClasses = storageClasses 

143 

144 # For execution butler the datastore needs a special 

145 # dependency-inversion trick. This is not used by regular butler, 

146 # but we do not have a way to distinguish regular butler from execution 

147 # butler. 

148 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

149 

150 self._registry_shim = RegistryShim(self) 

151 

152 return self 

153 

154 @classmethod 

155 def create_from_config( 

156 cls, 

157 config: ButlerConfig, 

158 *, 

159 options: ButlerInstanceOptions, 

160 without_datastore: bool = False, 

161 ) -> DirectButler: 

162 """Construct a Butler instance from a configuration file. 

163 

164 Parameters 

165 ---------- 

166 config : `ButlerConfig` 

167 The configuration for this Butler instance. 

168 options : `ButlerInstanceOptions` 

169 Default values and other settings for the Butler instance. 

170 without_datastore : `bool`, optional 

171 If `True` do not attach a datastore to this butler. Any attempts 

172 to use a datastore will fail. 

173 

174 Notes 

175 ----- 

176 Most users should call the top-level `Butler`.``from_config`` 

177 instead of using this function directly. 

178 """ 

179 if "run" in config or "collection" in config: 

180 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

181 

182 defaults = RegistryDefaults( 

183 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs 

184 ) 

185 try: 

186 butlerRoot = config.get("root", config.configDir) 

187 writeable = options.writeable 

188 if writeable is None: 

189 writeable = options.run is not None 

190 registry = _RegistryFactory(config).from_config( 

191 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

192 ) 

193 if without_datastore: 

194 datastore: Datastore = NullDatastore(None, None) 

195 else: 

196 datastore = Datastore.fromConfig( 

197 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

198 ) 

199 # TODO: Once datastore drops dependency on registry we can 

200 # construct datastore first and pass opaque tables to registry 

201 # constructor. 

202 registry.make_datastore_tables(datastore.get_opaque_table_definitions()) 

203 storageClasses = StorageClassFactory() 

204 storageClasses.addFromConfig(config) 

205 

206 return DirectButler( 

207 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses 

208 ) 

209 except Exception: 

210 # Failures here usually mean that the configuration is incomplete;

211 # just issue an error message that includes the config file URI.

212 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.") 

213 raise 

214 
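# Illustrative sketch (not part of direct_butler.py): as the docstring above
# notes, the top-level ``Butler.from_config`` is the intended entry point
# rather than ``DirectButler.create_from_config``. The repository path and
# collection names below are hypothetical placeholders.
def _example_construct_butler() -> Butler:
    return Butler.from_config(
        "/path/to/repo",                      # hypothetical repository root
        collections=["u/someone/inputs"],     # hypothetical default input collections
        run="u/someone/example_run",          # hypothetical default output run
        writeable=True,
    )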

215 def _clone( 

216 self, 

217 *, 

218 collections: Any = None, 

219 run: str | None = None, 

220 inferDefaults: bool = True, 

221 **kwargs: Any, 

222 ) -> DirectButler: 

223 # Docstring inherited 

224 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

225 registry = self._registry.copy(defaults) 

226 

227 return DirectButler( 

228 registry=registry, 

229 config=self._config, 

230 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()), 

231 storageClasses=self.storageClasses, 

232 ) 

233 

234 GENERATION: ClassVar[int] = 3 

235 """This is a Generation 3 Butler. 

236 

237 This attribute may be removed in the future, once the Generation 2 Butler 

238 interface has been fully retired; it should only be used in transitional 

239 code. 

240 """ 

241 

242 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

243 """Return DatasetType defined in registry given dataset type name.""" 

244 try: 

245 return self.get_dataset_type(name) 

246 except MissingDatasetTypeError: 

247 return None 

248 

249 @classmethod 

250 def _unpickle( 

251 cls, 

252 config: ButlerConfig, 

253 collections: tuple[str, ...] | None, 

254 run: str | None, 

255 defaultDataId: dict[str, str], 

256 writeable: bool, 

257 ) -> DirectButler: 

258 """Callable used to unpickle a Butler. 

259 

260 We prefer not to use ``Butler.__init__`` directly so we can force some 

261 of its many arguments to be keyword-only (note that ``__reduce__`` 

262 can only invoke callables with positional arguments). 

263 

264 Parameters 

265 ---------- 

266 config : `ButlerConfig` 

267 Butler configuration, already coerced into a true `ButlerConfig` 

268 instance (and hence after any search paths for overrides have been 

269 utilized). 

270 collections : `tuple` [ `str` ] 

271 Names of the default collections to read from. 

272 run : `str`, optional 

273 Name of the default `~CollectionType.RUN` collection to write to. 

274 defaultDataId : `dict` [ `str`, `str` ] 

275 Default data ID values. 

276 writeable : `bool` 

277 Whether the Butler should support write operations. 

278 

279 Returns 

280 ------- 

281 butler : `Butler` 

282 A new `Butler` instance. 

283 """ 

284 return cls.create_from_config( 

285 config=config, 

286 options=ButlerInstanceOptions( 

287 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId 

288 ), 

289 ) 

290 

291 def __reduce__(self) -> tuple: 

292 """Support pickling.""" 

293 return ( 

294 DirectButler._unpickle, 

295 ( 

296 self._config, 

297 self.collections, 

298 self.run, 

299 dict(self._registry.defaults.dataId.required), 

300 self._registry.isWriteable(), 

301 ), 

302 ) 

303 

304 def __str__(self) -> str: 

305 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

306 self.collections, self.run, self._datastore, self._registry 

307 ) 

308 

309 def isWriteable(self) -> bool: 

310 # Docstring inherited. 

311 return self._registry.isWriteable() 

312 

313 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

314 """Context manager that enables caching.""" 

315 return self._registry.caching_context() 

316 

317 @contextlib.contextmanager 

318 def transaction(self) -> Iterator[None]: 

319 """Context manager supporting `Butler` transactions. 

320 

321 Transactions can be nested. 

322 """ 

323 with self._registry.transaction(), self._datastore.transaction(): 

324 yield 

325 

326 def _standardizeArgs( 

327 self, 

328 datasetRefOrType: DatasetRef | DatasetType | str, 

329 dataId: DataId | None = None, 

330 for_put: bool = True, 

331 **kwargs: Any, 

332 ) -> tuple[DatasetType, DataId | None]: 

333 """Standardize the arguments passed to several Butler APIs. 

334 

335 Parameters 

336 ---------- 

337 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

338 When `DatasetRef` the `dataId` should be `None`. 

339 Otherwise the `DatasetType` or name thereof. 

340 dataId : `dict` or `DataCoordinate` 

341 A `dict` of `Dimension` link name, value pairs that label the 

342 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

343 should be provided as the second argument. 

344 for_put : `bool`, optional 

345 If `True` this call is invoked as part of a `Butler.put()`. 

346 Otherwise it is assumed to be part of a `Butler.get()`. This 

347 parameter is only relevant if there is dataset type 

348 inconsistency. 

349 **kwargs 

350 Additional keyword arguments used to augment or construct a 

351 `DataCoordinate`. See `DataCoordinate.standardize` 

352 parameters. 

353 

354 Returns 

355 ------- 

356 datasetType : `DatasetType` 

357 A `DatasetType` instance extracted from ``datasetRefOrType``. 

358 dataId : `dict` or `DataId`, optional 

359 Argument that can be used (along with ``kwargs``) to construct a 

360 `DataId`. 

361 

362 Notes 

363 ----- 

364 Butler APIs that conceptually need a DatasetRef also allow passing a 

365 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

366 keyword arguments that can be used to construct one) separately. This 

367 method accepts those arguments and always returns a true `DatasetType` 

368 and a `DataId` or `dict`. 

369 

370 Standardization of `dict` vs `DataId` is best handled by passing the 

371 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

372 generally similarly flexible. 

373 """ 

374 externalDatasetType: DatasetType | None = None 

375 internalDatasetType: DatasetType | None = None 

376 if isinstance(datasetRefOrType, DatasetRef): 

377 if dataId is not None or kwargs: 

378 raise ValueError("DatasetRef given, cannot use dataId as well") 

379 externalDatasetType = datasetRefOrType.datasetType 

380 dataId = datasetRefOrType.dataId 

381 else: 

382 # Don't check whether DataId is provided, because Registry APIs 

383 # can usually construct a better error message when it wasn't. 

384 if isinstance(datasetRefOrType, DatasetType): 

385 externalDatasetType = datasetRefOrType 

386 else: 

387 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

388 

389 # Check that they are self-consistent 

390 if externalDatasetType is not None: 

391 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

392 if externalDatasetType != internalDatasetType: 

393 # We can allow differences if they are compatible, depending 

394 # on whether this is a get or a put. A get requires that 

395 # the python type associated with the datastore can be 

396 # converted to the user type. A put requires that the user 

397 # supplied python type can be converted to the internal 

398 # type expected by registry. 

399 relevantDatasetType = internalDatasetType 

400 if for_put: 

401 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

402 else: 

403 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

404 relevantDatasetType = externalDatasetType 

405 if not is_compatible: 

406 raise ValueError( 

407 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

408 f"registry definition ({internalDatasetType})" 

409 ) 

410 # Override the internal definition. 

411 internalDatasetType = relevantDatasetType 

412 

413 assert internalDatasetType is not None 

414 return internalDatasetType, dataId 

415 

416 def _rewrite_data_id( 

417 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

418 ) -> tuple[DataId | None, dict[str, Any]]: 

419 """Rewrite a data ID taking into account dimension records. 

420 

421 Take a Data ID and keyword args and rewrite it if necessary to 

422 allow the user to specify dimension records rather than dimension 

423 primary values. 

424 

425 This allows a user to include a dataId dict with keys of 

426 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

427 the integer exposure ID. It also allows a string to be given 

428 for a dimension value rather than the integer ID if that is more 

429 convenient. For example, rather than having to specify the

430 detector with ``detector.full_name``, a string given for ``detector`` 

431 will be interpreted as the full name and converted to the integer 

432 value. 

433 

434 Keyword arguments can also use strings for dimensions like detector 

435 and exposure, but Python does not allow them to include ``.``, so

436 the ``exposure.day_obs`` syntax cannot be used in a keyword

437 argument. 

438 

439 Parameters 

440 ---------- 

441 dataId : `dict` or `DataCoordinate` 

442 A `dict` of `Dimension` link name, value pairs that will label the 

443 `DatasetRef` within a Collection. 

444 datasetType : `DatasetType` 

445 The dataset type associated with this dataId. Required to 

446 determine the relevant dimensions. 

447 **kwargs 

448 Additional keyword arguments used to augment or construct a 

449 `DataId`. See `DataId` parameters. 

450 

451 Returns 

452 ------- 

453 dataId : `dict` or `DataCoordinate` 

454 The dataId, possibly rewritten. If given a `DataCoordinate` and

455 no keyword arguments, the original dataId will be returned 

456 unchanged. 

457 **kwargs : `dict` 

458 Any unused keyword arguments (normally an empty dict).

459 """ 

460 # Do nothing if we have a standalone DataCoordinate. 

461 if isinstance(dataId, DataCoordinate) and not kwargs: 

462 return dataId, kwargs 

463 

464 # Process dimension records that are using record information 

465 # rather than ids 

466 newDataId: dict[str, DataIdValue] = {} 

467 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

468 

469 # If all of the dataId comes from keyword parameters we do not need

470 # to do anything here: keys cannot be of the form

471 # exposure.obs_id because a "." is not allowed in a keyword parameter.

472 if dataId: 

473 for k, v in dataId.items(): 

474 # If we have a Dimension we do not need to do anything 

475 # because it cannot be a compound key. 

476 if isinstance(k, str) and "." in k: 

477 # Someone is using a more human-readable dataId 

478 dimensionName, record = k.split(".", 1) 

479 byRecord[dimensionName][record] = v 

480 elif isinstance(k, Dimension): 

481 newDataId[k.name] = v 

482 else: 

483 newDataId[k] = v 

484 

485 # Go through the updated dataId and check the type in case someone is 

486 # using an alternate key. We have already filtered out the compound 

487 # dimension.record keys.

488 not_dimensions = {} 

489 

490 # Will need to look in the dataId and the keyword arguments 

491 # and will remove them if they need to be fixed or are unrecognized. 

492 for dataIdDict in (newDataId, kwargs): 

493 # Use a list so we can adjust the dict safely in the loop 

494 for dimensionName in list(dataIdDict): 

495 value = dataIdDict[dimensionName] 

496 try: 

497 dimension = self.dimensions.dimensions[dimensionName] 

498 except KeyError: 

499 # This is not a real dimension 

500 not_dimensions[dimensionName] = value 

501 del dataIdDict[dimensionName] 

502 continue 

503 

504 # Convert an integral type to an explicit int to simplify 

505 # comparisons here 

506 if isinstance(value, numbers.Integral): 

507 value = int(value) 

508 

509 if not isinstance(value, dimension.primaryKey.getPythonType()): 

510 for alternate in dimension.alternateKeys: 

511 if isinstance(value, alternate.getPythonType()): 

512 byRecord[dimensionName][alternate.name] = value 

513 del dataIdDict[dimensionName] 

514 _LOG.debug( 

515 "Converting dimension %s to %s.%s=%s", 

516 dimensionName, 

517 dimensionName, 

518 alternate.name, 

519 value, 

520 ) 

521 break 

522 else: 

523 _LOG.warning( 

524 "Type mismatch found for value '%r' provided for dimension %s. " 

525 "Could not find matching alternative (primary key has type %s) " 

526 "so attempting to use as-is.", 

527 value, 

528 dimensionName, 

529 dimension.primaryKey.getPythonType(), 

530 ) 

531 

532 # By this point kwargs and newDataId should only include valid 

533 # dimensions. Merge kwargs in to the new dataId and log if there 

534 # are dimensions in both (rather than calling update). 

535 for k, v in kwargs.items(): 

536 if k in newDataId and newDataId[k] != v: 

537 _LOG.debug( 

538 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

539 ) 

540 newDataId[k] = v 

541 # No need to retain any values in kwargs now. 

542 kwargs = {} 

543 

544 # If we have some unrecognized dimensions we have to try to connect 

545 # them to records in other dimensions. This is made more complicated 

546 # by some dimensions having records with clashing names. A mitigation 

547 # is that we can tell by this point which dimensions are missing 

548 # for the DatasetType but this does not work for calibrations 

549 # where additional dimensions can be used to constrain the temporal 

550 # axis. 

551 if not_dimensions: 

552 # Search for all dimensions even if we have been given a value 

553 # explicitly. In some cases records are given as well as the 

554 # actual dimension and this should not be an error if they

555 # match. 

556 mandatoryDimensions = datasetType.dimensions.names # - provided 

557 

558 candidateDimensions: set[str] = set() 

559 candidateDimensions.update(mandatoryDimensions) 

560 

561 # For calibrations we may well be needing temporal dimensions 

562 # so rather than always including all dimensions in the scan 

563 # restrict things a little. It is still possible for there 

564 # to be confusion over day_obs in visit vs exposure for example. 

565 # If we are not searching calibration collections things may 

566 # fail but they are going to fail anyway because of the 

567 # ambiguity of the dataId...

568 if datasetType.isCalibration(): 

569 for dim in self.dimensions.dimensions: 

570 if dim.temporal: 

571 candidateDimensions.add(str(dim)) 

572 

573 # Look up table for the first association with a dimension 

574 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

575 

576 # Keep track of whether an item is associated with multiple 

577 # dimensions. 

578 counter: Counter[str] = Counter() 

579 assigned: dict[str, set[str]] = defaultdict(set) 

580 

581 # Go through the missing dimensions and associate the 

582 # given names with records within those dimensions 

583 matched_dims = set() 

584 for dimensionName in candidateDimensions: 

585 dimension = self.dimensions.dimensions[dimensionName] 

586 fields = dimension.metadata.names | dimension.uniqueKeys.names 

587 for field in not_dimensions: 

588 if field in fields: 

589 guessedAssociation[dimensionName][field] = not_dimensions[field] 

590 counter[dimensionName] += 1 

591 assigned[field].add(dimensionName) 

592 matched_dims.add(field) 

593 

594 # Calculate the fields that matched nothing. 

595 never_found = set(not_dimensions) - matched_dims 

596 

597 if never_found: 

598 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

599 

600 # There is a chance we have allocated a single dataId item 

601 # to multiple dimensions. Need to decide which should be retained. 

602 # For now assume that the most popular alternative wins. 

603 # This means that day_obs with seq_num will result in 

604 # exposure.day_obs and not visit.day_obs 

605 # Also prefer an explicitly missing dimension over an inferred 

606 # temporal dimension. 

607 for fieldName, assignedDimensions in assigned.items(): 

608 if len(assignedDimensions) > 1: 

609 # Pick the most popular (preferring mandatory dimensions) 

610 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

611 if requiredButMissing: 

612 candidateDimensions = requiredButMissing 

613 else: 

614 candidateDimensions = assignedDimensions 

615 

616 # If this is a choice between visit and exposure and 

617 # neither was a required part of the dataset type, 

618 # (hence in this branch) always prefer exposure over 

619 # visit since exposures are always defined and visits 

620 # are defined from exposures. 

621 if candidateDimensions == {"exposure", "visit"}: 

622 candidateDimensions = {"exposure"} 

623 

624 # Select the relevant items and get a new restricted 

625 # counter. 

626 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

627 duplicatesCounter: Counter[str] = Counter() 

628 duplicatesCounter.update(theseCounts) 

629 

630 # Choose the most common. If they are equally common 

631 # we will pick the one that was found first. 

632 # Returns a list of tuples 

633 selected = duplicatesCounter.most_common(1)[0][0] 

634 

635 _LOG.debug( 

636 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

637 " Removed ambiguity by choosing dimension %s.", 

638 fieldName, 

639 ", ".join(assignedDimensions), 

640 selected, 

641 ) 

642 

643 for candidateDimension in assignedDimensions: 

644 if candidateDimension != selected: 

645 del guessedAssociation[candidateDimension][fieldName] 

646 

647 # Update the record look up dict with the new associations 

648 for dimensionName, values in guessedAssociation.items(): 

649 if values: # A dict might now be empty 

650 _LOG.debug( 

651 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

652 ) 

653 byRecord[dimensionName].update(values) 

654 

655 if byRecord: 

656 # Some record specifiers were found so we need to convert 

657 # them to the Id form 

658 for dimensionName, values in byRecord.items(): 

659 if dimensionName in newDataId: 

660 _LOG.debug( 

661 "DataId specified explicit %s dimension value of %s in addition to" 

662 " general record specifiers for it of %s. Ignoring record information.", 

663 dimensionName, 

664 newDataId[dimensionName], 

665 str(values), 

666 ) 

667 # Get the actual record and compare with these values. 

668 try: 

669 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

670 except DataIdError: 

671 raise ValueError( 

672 f"Could not find dimension '{dimensionName}'" 

673 f" with dataId {newDataId} as part of comparing with" 

674 f" record values {byRecord[dimensionName]}" 

675 ) from None 

676 if len(recs) == 1: 

677 errmsg: list[str] = [] 

678 for k, v in values.items(): 

679 if (recval := getattr(recs[0], k)) != v: 

680 errmsg.append(f"{k}({recval} != {v})") 

681 if errmsg: 

682 raise ValueError( 

683 f"Dimension {dimensionName} in dataId has explicit value" 

684 " inconsistent with records: " + ", ".join(errmsg) 

685 ) 

686 else: 

687 # Multiple matches for an explicit dimension 

688 # should never happen but let downstream complain. 

689 pass 

690 continue 

691 

692 # Build up a WHERE expression 

693 bind = dict(values.items()) 

694 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

695 

696 # Hopefully we get a single record that matches 

697 records = set( 

698 self._registry.queryDimensionRecords( 

699 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

700 ) 

701 ) 

702 

703 if len(records) != 1: 

704 if len(records) > 1: 

705 # visit can have an ambiguous answer without involving 

706 # visit_system. The default visit_system is defined 

707 # by the instrument. 

708 if ( 

709 dimensionName == "visit" 

710 and "visit_system_membership" in self.dimensions 

711 and "visit_system" in self.dimensions["instrument"].metadata 

712 ): 

713 instrument_records = list( 

714 self._registry.queryDimensionRecords( 

715 "instrument", 

716 dataId=newDataId, 

717 **kwargs, 

718 ) 

719 ) 

720 if len(instrument_records) == 1: 

721 visit_system = instrument_records[0].visit_system 

722 if visit_system is None: 

723 # Set to a value that will never match. 

724 visit_system = -1 

725 

726 # Look up each visit in the 

727 # visit_system_membership records. 

728 for rec in records: 

729 membership = list( 

730 self._registry.queryDimensionRecords( 

731 # Use bind to allow zero results. 

732 # This is a fully-specified query. 

733 "visit_system_membership", 

734 where="instrument = inst AND visit_system = system AND visit = v", 

735 bind=dict( 

736 inst=instrument_records[0].name, system=visit_system, v=rec.id 

737 ), 

738 ) 

739 ) 

740 if membership: 

741 # This record is the right answer. 

742 records = {rec} 

743 break 

744 

745 # The ambiguity may have been resolved so check again. 

746 if len(records) > 1: 

747 _LOG.debug( 

748 "Received %d records from constraints of %s", len(records), str(values) 

749 ) 

750 for r in records: 

751 _LOG.debug("- %s", str(r)) 

752 raise ValueError( 

753 f"DataId specification for dimension {dimensionName} is not" 

754 f" uniquely constrained to a single dataset by {values}." 

755 f" Got {len(records)} results." 

756 ) 

757 else: 

758 raise ValueError( 

759 f"DataId specification for dimension {dimensionName} matched no" 

760 f" records when constrained by {values}" 

761 ) 

762 

763 # Get the primary key from the real dimension object 

764 dimension = self.dimensions.dimensions[dimensionName] 

765 if not isinstance(dimension, Dimension): 

766 raise RuntimeError( 

767 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

768 ) 

769 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

770 

771 return newDataId, kwargs 

772 
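# Illustrative sketch (not part of direct_butler.py): the data ID rewriting
# above is what lets callers use dimension-record keys such as
# ``exposure.day_obs``/``exposure.seq_num`` and a detector name instead of the
# integer primary keys. The butler, dataset type, collection, and data ID
# values here are hypothetical.
def _example_record_based_data_id(butler: Butler) -> Any:
    return butler.get(
        "raw",
        {"exposure.day_obs": 20240101, "exposure.seq_num": 42},
        instrument="HypotheticalCam",
        detector="S01",                       # full name resolved via an alternate key
        collections="HypotheticalCam/raw/all",
    )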

773 def _findDatasetRef( 

774 self, 

775 datasetRefOrType: DatasetRef | DatasetType | str, 

776 dataId: DataId | None = None, 

777 *, 

778 collections: Any = None, 

779 predict: bool = False, 

780 run: str | None = None, 

781 datastore_records: bool = False, 

782 **kwargs: Any, 

783 ) -> DatasetRef: 

784 """Shared logic for methods that start with a search for a dataset in 

785 the registry. 

786 

787 Parameters 

788 ---------- 

789 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

790 When `DatasetRef` the `dataId` should be `None`. 

791 Otherwise the `DatasetType` or name thereof. 

792 dataId : `dict` or `DataCoordinate`, optional 

793 A `dict` of `Dimension` link name, value pairs that label the 

794 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

795 should be provided as the first argument. 

796 collections : Any, optional 

797 Collections to be searched, overriding ``self.collections``. 

798 Can be any of the types supported by the ``collections`` argument 

799 to butler construction. 

800 predict : `bool`, optional 

801 If `True`, return a newly created `DatasetRef` with a unique 

802 dataset ID if finding a reference in the `Registry` fails. 

803 Defaults to `False`. 

804 run : `str`, optional 

805 Run collection name to use for creating `DatasetRef` for predicted 

806 datasets. Only used if ``predict`` is `True`. 

807 datastore_records : `bool`, optional 

808 If `True` add datastore records to returned `DatasetRef`. 

809 **kwargs 

810 Additional keyword arguments used to augment or construct a 

811 `DataId`. See `DataId` parameters. 

812 

813 Returns 

814 ------- 

815 ref : `DatasetRef` 

816 A reference to the dataset identified by the given arguments. 

817 This can be the same dataset reference as given if it was 

818 resolved. 

819 

820 Raises 

821 ------ 

822 LookupError 

823 Raised if no matching dataset exists in the `Registry` (and 

824 ``predict`` is `False`). 

825 ValueError 

826 Raised if a resolved `DatasetRef` was passed as an input, but it 

827 differs from the one found in the registry. 

828 TypeError 

829 Raised if no collections were provided. 

830 """ 

831 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

832 if isinstance(datasetRefOrType, DatasetRef): 

833 if collections is not None: 

834 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

835 # May need to retrieve datastore records if requested. 

836 if datastore_records and datasetRefOrType._datastore_records is None: 

837 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

838 return datasetRefOrType 

839 timespan: Timespan | None = None 

840 

841 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

842 

843 if datasetType.isCalibration(): 

844 # Because this is a calibration dataset, first try to

845 # standardize the data ID without restricting the dimensions to

846 # those of the dataset type requested, because there may be extra 

847 # dimensions that provide temporal information for a validity-range 

848 # lookup. 

849 dataId = DataCoordinate.standardize( 

850 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

851 ) 

852 if dataId.dimensions.temporal: 

853 dataId = self._registry.expandDataId(dataId) 

854 timespan = dataId.timespan 

855 else: 

856 # Standardize the data ID to just the dimensions of the dataset 

857 # type instead of letting registry.findDataset do it, so we get the 

858 # result even if no dataset is found. 

859 dataId = DataCoordinate.standardize( 

860 dataId, 

861 dimensions=datasetType.dimensions, 

862 defaults=self._registry.defaults.dataId, 

863 **kwargs, 

864 ) 

865 # Always lookup the DatasetRef, even if one is given, to ensure it is 

866 # present in the current collection. 

867 ref = self.find_dataset( 

868 datasetType, 

869 dataId, 

870 collections=collections, 

871 timespan=timespan, 

872 datastore_records=datastore_records, 

873 ) 

874 if ref is None: 

875 if predict: 

876 if run is None: 

877 run = self.run 

878 if run is None: 

879 raise TypeError("Cannot predict dataset ID/location with run=None.") 

880 return DatasetRef(datasetType, dataId, run=run) 

881 else: 

882 if collections is None: 

883 collections = self._registry.defaults.collections 

884 raise LookupError( 

885 f"Dataset {datasetType.name} with data ID {dataId} " 

886 f"could not be found in collections {collections}." 

887 ) 

888 if datasetType != ref.datasetType: 

889 # If they differ it is because the user explicitly specified 

890 # a compatible dataset type to this call rather than using the 

891 # registry definition. The DatasetRef must therefore be recreated 

892 # using the user definition such that the expected type is 

893 # returned. 

894 ref = DatasetRef( 

895 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

896 ) 

897 

898 return ref 

899 

900 @transactional 

901 def put( 

902 self, 

903 obj: Any, 

904 datasetRefOrType: DatasetRef | DatasetType | str, 

905 /, 

906 dataId: DataId | None = None, 

907 *, 

908 run: str | None = None, 

909 **kwargs: Any, 

910 ) -> DatasetRef: 

911 """Store and register a dataset. 

912 

913 Parameters 

914 ---------- 

915 obj : `object` 

916 The dataset. 

917 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

918 When `DatasetRef` is provided, ``dataId`` should be `None`. 

919 Otherwise the `DatasetType` or name thereof. If a fully resolved 

920 `DatasetRef` is given the run and ID are used directly. 

921 dataId : `dict` or `DataCoordinate` 

922 A `dict` of `Dimension` link name, value pairs that label the 

923 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

924 should be provided as the second argument. 

925 run : `str`, optional 

926 The name of the run the dataset should be added to, overriding 

927 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

928 **kwargs 

929 Additional keyword arguments used to augment or construct a 

930 `DataCoordinate`. See `DataCoordinate.standardize` 

931 parameters. Not used if a resolved `DatasetRef` is provided.

932 

933 Returns 

934 ------- 

935 ref : `DatasetRef` 

936 A reference to the stored dataset, updated with the correct id if 

937 given. 

938 

939 Raises 

940 ------ 

941 TypeError 

942 Raised if the butler is read-only or if no run has been provided. 

943 """ 

944 if isinstance(datasetRefOrType, DatasetRef): 

945 # This is a direct put of predefined DatasetRef. 

946 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

947 if run is not None: 

948 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

949 # If registry already has a dataset with the same dataset ID, 

950 # dataset type and DataId, then _importDatasets will do nothing and 

951 # just return the original ref. We have to raise in this case; there

952 # is a datastore check below for that. 

953 self._registry._importDatasets([datasetRefOrType], expand=True) 

954 # Before trying to write to the datastore check that it does not 

955 # know this dataset. This is prone to races, of course. 

956 if self._datastore.knows(datasetRefOrType): 

957 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

958 # Try to write the dataset to the datastore; if it fails due to a race

959 # with another write, the content of stored data may be 

960 # unpredictable. 

961 try: 

962 self._datastore.put(obj, datasetRefOrType) 

963 except IntegrityError as e: 

964 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

965 return datasetRefOrType 

966 

967 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

968 if not self.isWriteable(): 

969 raise TypeError("Butler is read-only.") 

970 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

971 

972 # Handle dimension records in dataId 

973 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

974 

975 # Add Registry Dataset entry. 

976 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

977 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

978 self._datastore.put(obj, ref) 

979 

980 return ref 

981 
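# Illustrative sketch (not part of direct_butler.py): a plain ``put`` with a
# dataset type name plus data ID keywords. "exampleMetrics" is a hypothetical
# dataset type assumed to be registered with dimensions {instrument, visit}.
def _example_put(butler: Butler) -> DatasetRef:
    metrics = {"n_sources": 1234}             # hypothetical in-memory dataset
    return butler.put(
        metrics,
        "exampleMetrics",
        instrument="HypotheticalCam",
        visit=903334,
        run="u/someone/example_run",
    )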

982 def getDeferred( 

983 self, 

984 datasetRefOrType: DatasetRef | DatasetType | str, 

985 /, 

986 dataId: DataId | None = None, 

987 *, 

988 parameters: dict | None = None, 

989 collections: Any = None, 

990 storageClass: str | StorageClass | None = None, 

991 **kwargs: Any, 

992 ) -> DeferredDatasetHandle: 

993 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

994 after an immediate registry lookup. 

995 

996 Parameters 

997 ---------- 

998 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

999 When `DatasetRef` the `dataId` should be `None`. 

1000 Otherwise the `DatasetType` or name thereof. 

1001 dataId : `dict` or `DataCoordinate`, optional 

1002 A `dict` of `Dimension` link name, value pairs that label the 

1003 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1004 should be provided as the first argument. 

1005 parameters : `dict` 

1006 Additional StorageClass-defined options to control reading, 

1007 typically used to efficiently read only a subset of the dataset. 

1008 collections : Any, optional 

1009 Collections to be searched, overriding ``self.collections``. 

1010 Can be any of the types supported by the ``collections`` argument 

1011 to butler construction. 

1012 storageClass : `StorageClass` or `str`, optional 

1013 The storage class to be used to override the Python type 

1014 returned by this method. By default the returned type matches 

1015 the dataset type definition for this dataset. Specifying a 

1016 read `StorageClass` can force a different type to be returned. 

1017 This type must be compatible with the original type. 

1018 **kwargs 

1019 Additional keyword arguments used to augment or construct a 

1020 `DataId`. See `DataId` parameters. 

1021 

1022 Returns 

1023 ------- 

1024 obj : `DeferredDatasetHandle` 

1025 A handle which can be used to retrieve a dataset at a later time. 

1026 

1027 Raises 

1028 ------ 

1029 LookupError 

1030 Raised if no matching dataset exists in the `Registry` or 

1031 datastore. 

1032 ValueError 

1033 Raised if a resolved `DatasetRef` was passed as an input, but it 

1034 differs from the one found in the registry. 

1035 TypeError 

1036 Raised if no collections were provided. 

1037 """ 

1038 if isinstance(datasetRefOrType, DatasetRef): 

1039 # Do the quick check first and if that fails, check for artifact 

1040 # existence. This is necessary for datastores that are configured 

1041 # in trust mode where there won't be a record but there will be 

1042 # a file. 

1043 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1044 ref = datasetRefOrType 

1045 else: 

1046 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1047 else: 

1048 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1049 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1050 
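# Illustrative sketch (not part of direct_butler.py): ``getDeferred`` performs
# the registry lookup immediately but defers reading the artifact until the
# handle's ``get`` is called. Names and data ID values are hypothetical.
def _example_deferred_get(butler: Butler) -> Any:
    handle: DeferredDatasetHandle = butler.getDeferred(
        "calexp",
        instrument="HypotheticalCam",
        visit=903334,
        detector=10,
        collections="u/someone/processed",
    )
    # The dataset reference is already resolved; the artifact is only read here.
    return handle.get()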

1051 def get( 

1052 self, 

1053 datasetRefOrType: DatasetRef | DatasetType | str, 

1054 /, 

1055 dataId: DataId | None = None, 

1056 *, 

1057 parameters: dict[str, Any] | None = None, 

1058 collections: Any = None, 

1059 storageClass: StorageClass | str | None = None, 

1060 **kwargs: Any, 

1061 ) -> Any: 

1062 """Retrieve a stored dataset. 

1063 

1064 Parameters 

1065 ---------- 

1066 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1067 When `DatasetRef` the `dataId` should be `None`. 

1068 Otherwise the `DatasetType` or name thereof. 

1069 If a resolved `DatasetRef`, the associated dataset 

1070 is returned directly without additional querying. 

1071 dataId : `dict` or `DataCoordinate` 

1072 A `dict` of `Dimension` link name, value pairs that label the 

1073 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1074 should be provided as the first argument. 

1075 parameters : `dict` 

1076 Additional StorageClass-defined options to control reading, 

1077 typically used to efficiently read only a subset of the dataset. 

1078 collections : Any, optional 

1079 Collections to be searched, overriding ``self.collections``. 

1080 Can be any of the types supported by the ``collections`` argument 

1081 to butler construction. 

1082 storageClass : `StorageClass` or `str`, optional 

1083 The storage class to be used to override the Python type 

1084 returned by this method. By default the returned type matches 

1085 the dataset type definition for this dataset. Specifying a 

1086 read `StorageClass` can force a different type to be returned. 

1087 This type must be compatible with the original type. 

1088 **kwargs 

1089 Additional keyword arguments used to augment or construct a 

1090 `DataCoordinate`. See `DataCoordinate.standardize` 

1091 parameters. 

1092 

1093 Returns 

1094 ------- 

1095 obj : `object` 

1096 The dataset. 

1097 

1098 Raises 

1099 ------ 

1100 LookupError 

1101 Raised if no matching dataset exists in the `Registry`. 

1102 TypeError 

1103 Raised if no collections were provided. 

1104 

1105 Notes 

1106 ----- 

1107 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1108 this method requires that the given data ID include temporal dimensions 

1109 beyond the dimensions of the dataset type itself, in order to find the 

1110 dataset with the appropriate validity range. For example, a "bias" 

1111 dataset with native dimensions ``{instrument, detector}`` could be 

1112 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1113 ``exposure`` is a temporal dimension. 

1114 """ 

1115 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1116 ref = self._findDatasetRef( 

1117 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs 

1118 ) 

1119 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1120 
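# Illustrative sketch (not part of direct_butler.py): the calibration lookup
# described in the Notes above -- a "bias" with dimensions
# {instrument, detector} is found via an exposure that supplies the temporal
# information for the validity-range search. Values and the collection name
# are hypothetical.
def _example_calibration_get(butler: Butler) -> Any:
    return butler.get(
        "bias",
        instrument="HypotheticalCam",
        detector=10,
        exposure=903334,                      # temporal dimension for the lookup
        collections="HypotheticalCam/calib",
    )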

1121 def getURIs( 

1122 self, 

1123 datasetRefOrType: DatasetRef | DatasetType | str, 

1124 /, 

1125 dataId: DataId | None = None, 

1126 *, 

1127 predict: bool = False, 

1128 collections: Any = None, 

1129 run: str | None = None, 

1130 **kwargs: Any, 

1131 ) -> DatasetRefURIs: 

1132 """Return the URIs associated with the dataset. 

1133 

1134 Parameters 

1135 ---------- 

1136 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1137 When `DatasetRef` the `dataId` should be `None`. 

1138 Otherwise the `DatasetType` or name thereof. 

1139 dataId : `dict` or `DataCoordinate` 

1140 A `dict` of `Dimension` link name, value pairs that label the 

1141 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1142 should be provided as the first argument. 

1143 predict : `bool` 

1144 If `True`, allow URIs to be returned of datasets that have not 

1145 been written. 

1146 collections : Any, optional 

1147 Collections to be searched, overriding ``self.collections``. 

1148 Can be any of the types supported by the ``collections`` argument 

1149 to butler construction. 

1150 run : `str`, optional 

1151 Run to use for predictions, overriding ``self.run``. 

1152 **kwargs 

1153 Additional keyword arguments used to augment or construct a 

1154 `DataCoordinate`. See `DataCoordinate.standardize` 

1155 parameters. 

1156 

1157 Returns 

1158 ------- 

1159 uris : `DatasetRefURIs` 

1160 The URI to the primary artifact associated with this dataset (if 

1161 the dataset was disassembled within the datastore this may be 

1162 `None`), and the URIs to any components associated with the dataset 

1163 artifact (this can be empty if there are no components).

1164 """ 

1165 ref = self._findDatasetRef( 

1166 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1167 ) 

1168 return self._datastore.getURIs(ref, predict) 

1169 
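# Illustrative sketch (not part of direct_butler.py), assuming the
# ``DatasetRefURIs`` attributes ``primaryURI`` and ``componentURIs``: the
# primary URI may be `None` for disassembled composites, with per-component
# URIs returned alongside. Names and data ID values are hypothetical.
def _example_print_uris(butler: Butler) -> None:
    uris = butler.getURIs(
        "calexp",
        instrument="HypotheticalCam",
        visit=903334,
        detector=10,
        collections="u/someone/processed",
    )
    if uris.primaryURI is not None:           # None if the dataset was disassembled
        print(uris.primaryURI.geturl())
    for component, uri in uris.componentURIs.items():
        print(component, uri.geturl())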

1170 def get_dataset_type(self, name: str) -> DatasetType: 

1171 return self._registry.getDatasetType(name) 

1172 

1173 def get_dataset( 

1174 self, 

1175 id: DatasetId, 

1176 *, 

1177 storage_class: str | StorageClass | None = None, 

1178 dimension_records: bool = False, 

1179 datastore_records: bool = False, 

1180 ) -> DatasetRef | None: 

1181 ref = self._registry.getDataset(id) 

1182 if ref is not None: 

1183 if dimension_records: 

1184 ref = ref.expanded( 

1185 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1186 ) 

1187 if storage_class: 

1188 ref = ref.overrideStorageClass(storage_class) 

1189 if datastore_records: 

1190 ref = self._registry.get_datastore_records(ref) 

1191 return ref 

1192 

1193 def find_dataset( 

1194 self, 

1195 dataset_type: DatasetType | str, 

1196 data_id: DataId | None = None, 

1197 *, 

1198 collections: str | Sequence[str] | None = None, 

1199 timespan: Timespan | None = None, 

1200 storage_class: str | StorageClass | None = None, 

1201 dimension_records: bool = False, 

1202 datastore_records: bool = False, 

1203 **kwargs: Any, 

1204 ) -> DatasetRef | None: 

1205 # Handle any parts of the dataID that are not using primary dimension 

1206 # keys. 

1207 if isinstance(dataset_type, str): 

1208 actual_type = self.get_dataset_type(dataset_type) 

1209 else: 

1210 actual_type = dataset_type 

1211 

1212 # Store the component for later. 

1213 component_name = actual_type.component() 

1214 if actual_type.isComponent(): 

1215 parent_type = actual_type.makeCompositeDatasetType() 

1216 else: 

1217 parent_type = actual_type 

1218 

1219 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs) 

1220 

1221 ref = self._registry.findDataset( 

1222 parent_type, 

1223 data_id, 

1224 collections=collections, 

1225 timespan=timespan, 

1226 datastore_records=datastore_records, 

1227 **kwargs, 

1228 ) 

1229 if ref is not None and dimension_records: 

1230 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1231 if ref is not None and component_name: 

1232 ref = ref.makeComponentRef(component_name) 

1233 if ref is not None and storage_class is not None: 

1234 ref = ref.overrideStorageClass(storage_class) 

1235 

1236 return ref 

1237 
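# Illustrative sketch (not part of direct_butler.py): ``find_dataset`` returns
# a resolved `DatasetRef` (or `None`) without reading the artifact, and the
# ref can then be passed to ``get``. Names and data ID values are
# hypothetical.
def _example_find_then_get(butler: Butler) -> Any | None:
    ref = butler.find_dataset(
        "calexp",
        instrument="HypotheticalCam",
        visit=903334,
        detector=10,
        collections="u/someone/processed",
        dimension_records=True,               # expand the data ID with dimension records
    )
    return butler.get(ref) if ref is not None else None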

1238 def retrieveArtifacts( 

1239 self, 

1240 refs: Iterable[DatasetRef], 

1241 destination: ResourcePathExpression, 

1242 transfer: str = "auto", 

1243 preserve_path: bool = True, 

1244 overwrite: bool = False, 

1245 ) -> list[ResourcePath]: 

1246 # Docstring inherited. 

1247 return self._datastore.retrieveArtifacts( 

1248 refs, 

1249 ResourcePath(destination), 

1250 transfer=transfer, 

1251 preserve_path=preserve_path, 

1252 overwrite=overwrite, 

1253 ) 

1254 

1255 def exists( 

1256 self, 

1257 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1258 /, 

1259 data_id: DataId | None = None, 

1260 *, 

1261 full_check: bool = True, 

1262 collections: Any = None, 

1263 **kwargs: Any, 

1264 ) -> DatasetExistence: 

1265 # Docstring inherited. 

1266 existence = DatasetExistence.UNRECOGNIZED 

1267 

1268 if isinstance(dataset_ref_or_type, DatasetRef): 

1269 if collections is not None: 

1270 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1271 if data_id is not None: 

1272 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1273 ref = dataset_ref_or_type 

1274 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1275 if registry_ref is not None: 

1276 existence |= DatasetExistence.RECORDED 

1277 

1278 if dataset_ref_or_type != registry_ref: 

1279 # This could mean that storage classes differ, so we should 

1280 # check for that but use the registry ref for the rest of 

1281 # the method. 

1282 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1283 # Use the registry version from now on. 

1284 ref = registry_ref 

1285 else: 

1286 raise ValueError( 

1287 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1288 f"in registry but has different incompatible values ({registry_ref})." 

1289 ) 

1290 else: 

1291 try: 

1292 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1293 except (LookupError, TypeError): 

1294 return existence 

1295 existence |= DatasetExistence.RECORDED 

1296 

1297 if self._datastore.knows(ref): 

1298 existence |= DatasetExistence.DATASTORE 

1299 

1300 if full_check: 

1301 if self._datastore.exists(ref): 

1302 existence |= DatasetExistence._ARTIFACT 

1303 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1304 # Do not add this flag if we have no other idea about a dataset. 

1305 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1306 

1307 return existence 

1308 
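# Illustrative sketch (not part of direct_butler.py): interpreting the
# ``DatasetExistence`` flags combined above. Names and data ID values are
# hypothetical.
def _example_existence_check(butler: Butler) -> bool:
    existence = butler.exists(
        "calexp",
        instrument="HypotheticalCam",
        visit=903334,
        detector=10,
        collections="u/someone/processed",
    )
    # RECORDED: registry knows the dataset; DATASTORE: the datastore has a
    # record of it (artifact verification requires full_check=True, the default).
    return DatasetExistence.RECORDED in existence and DatasetExistence.DATASTORE in existence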

1309 def _exists_many( 

1310 self, 

1311 refs: Iterable[DatasetRef], 

1312 /, 

1313 *, 

1314 full_check: bool = True, 

1315 ) -> dict[DatasetRef, DatasetExistence]: 

1316 # Docstring inherited. 

1317 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1318 

1319 # Registry does not have a bulk API to check for a ref. 

1320 for ref in refs: 

1321 registry_ref = self._registry.getDataset(ref.id) 

1322 if registry_ref is not None: 

1323 # It is possible, albeit unlikely, that the given ref does 

1324 # not match the one in registry even though the UUID matches. 

1325 # When checking a single ref we raise, but it's impolite to 

1326 # do that when potentially hundreds of refs are being checked. 

1327 # We could change the API to only accept UUIDs and that would 

1328 # remove the ability to even check and remove the worry 

1329 # about differing storage classes. Given the ongoing discussion 

1330 # on refs vs UUIDs and whether to raise or have a new 

1331 # private flag, treat this as a private API for now. 

1332 existence[ref] |= DatasetExistence.RECORDED 

1333 

1334 # Ask datastore if it knows about these refs. 

1335 knows = self._datastore.knows_these(refs) 

1336 for ref, known in knows.items(): 

1337 if known: 

1338 existence[ref] |= DatasetExistence.DATASTORE 

1339 

1340 if full_check: 

1341 mexists = self._datastore.mexists(refs) 

1342 for ref, exists in mexists.items(): 

1343 if exists: 

1344 existence[ref] |= DatasetExistence._ARTIFACT 

1345 else: 

1346 # Do not set this flag if nothing is known about the dataset. 

1347 for ref in existence: 

1348 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1349 existence[ref] |= DatasetExistence._ASSUMED 

1350 

1351 return existence 

1352 

1353 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1354 # Docstring inherited. 

1355 if not self.isWriteable(): 

1356 raise TypeError("Butler is read-only.") 

1357 names = list(names) 

1358 refs: list[DatasetRef] = [] 

1359 for name in names: 

1360 collectionType = self._registry.getCollectionType(name) 

1361 if collectionType is not CollectionType.RUN: 

1362 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1363 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1364 with self._datastore.transaction(), self._registry.transaction(): 

1365 if unstore: 

1366 self._datastore.trash(refs) 

1367 else: 

1368 self._datastore.forget(refs) 

1369 for name in names: 

1370 self._registry.removeCollection(name) 

1371 if unstore: 

1372 # Point of no return for removing artifacts 

1373 self._datastore.emptyTrash() 

1374 

1375 def pruneDatasets( 

1376 self, 

1377 refs: Iterable[DatasetRef], 

1378 *, 

1379 disassociate: bool = True, 

1380 unstore: bool = False, 

1381 tags: Iterable[str] = (), 

1382 purge: bool = False, 

1383 ) -> None: 

1384 # docstring inherited from LimitedButler 

1385 

1386 if not self.isWriteable(): 

1387 raise TypeError("Butler is read-only.") 

1388 if purge: 

1389 if not disassociate: 

1390 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1391 if not unstore: 

1392 raise TypeError("Cannot pass purge=True without unstore=True.") 

1393 elif disassociate: 

1394 tags = tuple(tags) 

1395 if not tags: 

1396 raise TypeError("No tags provided but disassociate=True.") 

1397 for tag in tags: 

1398 collectionType = self._registry.getCollectionType(tag) 

1399 if collectionType is not CollectionType.TAGGED: 

1400 raise TypeError( 

1401 f"Cannot disassociate from collection '{tag}' " 

1402 f"of non-TAGGED type {collectionType.name}." 

1403 ) 

1404 # Transform possibly-single-pass iterable into something we can iterate 

1405 # over multiple times. 

1406 refs = list(refs) 

1407 # Pruning a component of a DatasetRef makes no sense since registry 

1408 # doesn't know about components and datastore might not store 

1409 # components in a separate file.

1410 for ref in refs: 

1411 if ref.datasetType.component(): 

1412 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1413 # We don't need an unreliable Datastore transaction for this, because 

1414 # we've been extra careful to ensure that Datastore.trash only involves 

1415 # mutating the Registry (it can _look_ at Datastore-specific things, 

1416 # but shouldn't change them), and hence all operations here are 

1417 # Registry operations. 

1418 with self._datastore.transaction(), self._registry.transaction(): 

1419 if unstore: 

1420 self._datastore.trash(refs) 

1421 if purge: 

1422 self._registry.removeDatasets(refs) 

1423 elif disassociate: 

1424 assert tags, "Guaranteed by earlier logic in this function." 

1425 for tag in tags: 

1426 self._registry.disassociate(tag, refs) 

1427 # We've exited the Registry transaction, and apparently committed. 

1428 # (if there was an exception, everything rolled back, and it's as if 

1429 # nothing happened - and we never get here). 

1430 # Datastore artifacts are not yet gone, but they're clearly marked 

1431 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1432 # problems we can try again later, and if manual administrative 

1433 # intervention is required, it's pretty clear what that should entail: 

1434 # deleting everything on disk and in private Datastore tables that is 

1435 # in the dataset_location_trash table. 

1436 if unstore: 

1437 # Point of no return for removing artifacts 

1438 self._datastore.emptyTrash() 

1439 
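# Illustrative sketch (not part of direct_butler.py): disassociating datasets
# from a (hypothetical) TAGGED collection and removing their stored artifacts
# via ``pruneDatasets``. The dataset type and collection name are
# hypothetical.
def _example_prune_tagged(butler: Butler, tagged_collection: str) -> None:
    refs = list(
        butler.registry.queryDatasets("calexp", collections=tagged_collection, findFirst=True)
    )
    butler.pruneDatasets(refs, disassociate=True, tags=[tagged_collection], unstore=True, purge=False)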

1440 @transactional 

1441 def ingest( 

1442 self, 

1443 *datasets: FileDataset, 

1444 transfer: str | None = "auto", 

1445 record_validation_info: bool = True, 

1446 ) -> None: 

1447 # Docstring inherited. 

1448 if not self.isWriteable(): 

1449 raise TypeError("Butler is read-only.") 

1450 

1451 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1452 if not datasets: 

1453 return 

1454 

1455 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1456 

1457 # We need to reorganize all the inputs so that they are grouped 

1458 # by dataset type and run. Multiple refs in a single FileDataset 

1459 # are required to share the run and dataset type. 

1460 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1461 

1462 # Track DataIDs that are being ingested so we can spot issues early 

1463 # with duplication. Retain previous FileDataset so we can report it. 

1464 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = ( 

1465 defaultdict(dict) 

1466 ) 

1467 

1468 # And the nested loop that populates it: 

1469 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1470 # Somewhere to store pre-existing refs if we have an 

1471 # execution butler. 

1472 existingRefs: list[DatasetRef] = [] 

1473 

1474 for ref in dataset.refs: 

1475 group_key = (ref.datasetType, ref.run) 

1476 

1477 if ref.dataId in groupedDataIds[group_key]: 

1478 raise ConflictingDefinitionError( 

1479 f"Ingest conflict. Dataset {dataset.path} has same" 

1480 " DataId as other ingest dataset" 

1481 f" {groupedDataIds[group_key][ref.dataId].path} " 

1482 f" ({ref.dataId})" 

1483 ) 

1484 

1485 groupedDataIds[group_key][ref.dataId] = dataset 

1486 

1487 if existingRefs: 

1488 if len(dataset.refs) != len(existingRefs): 

1489 # Keeping track of partially pre-existing datasets is hard 

1490 # and should generally never happen. For now don't allow 

1491 # it. 

1492 raise ConflictingDefinitionError( 

1493 f"For dataset {dataset.path} some dataIds already exist" 

1494 " in registry but others do not. This is not supported." 

1495 ) 

1496 

1497 # Store expanded form in the original FileDataset. 

1498 dataset.refs = existingRefs 

1499 else: 

1500 groupedData[group_key].append(dataset) 

1501 

1502 # Now we can bulk-insert into Registry for each DatasetType. 

1503 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1504 groupedData.items(), desc="Bulk-inserting datasets by type" 

1505 ): 

1506 refs_to_import = [] 

1507 for dataset in grouped_datasets: 

1508 refs_to_import.extend(dataset.refs) 

1509 

1510 n_refs = len(refs_to_import) 

1511 _LOG.verbose( 

1512 "Importing %d ref%s of dataset type %r into run %r", 

1513 n_refs, 

1514 "" if n_refs == 1 else "s", 

1515 datasetType.name, 

1516 this_run, 

1517 ) 

1518 

1519 # Import the refs and expand the DataCoordinates since we can't 

1520 # guarantee that they are expanded and Datastore will need 

1521 # the records. 

1522 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1523 assert set(imported_refs) == set(refs_to_import) 

1524 

1525 # Replace all the refs in the FileDataset with expanded versions. 

1526 # Pull them off in the order we put them on the list. 

1527 for dataset in grouped_datasets: 

1528 n_dataset_refs = len(dataset.refs) 

1529 dataset.refs = imported_refs[:n_dataset_refs] 

1530 del imported_refs[:n_dataset_refs] 

1531 

1532 # Bulk-insert everything into Datastore. 

1533 # We do not know if any of the registry entries already existed 

1534 # (_importDatasets only complains if they exist but differ) so 

1535 # we have to catch IntegrityError explicitly. 

1536 try: 

1537 self._datastore.ingest( 

1538 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1539 ) 

1540 except IntegrityError as e: 

1541 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1542 
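# Example (illustrative sketch; the dataset type name, data ID values, and
# file path are hypothetical, and the corresponding dimension records are
# assumed to exist already): ingest an on-disk file as a registered dataset.
#
#     butler = Butler("/repo/example", writeable=True, run="ingest/run")
#     dataset_type = butler.get_dataset_type("raw")
#     ref = DatasetRef(
#         dataset_type,
#         DataCoordinate.standardize(
#             {"instrument": "MyCam", "exposure": 42, "detector": 1},
#             universe=butler.dimensions,
#         ),
#         run="ingest/run",
#     )
#     butler.ingest(FileDataset(path="/data/raw_42_1.fits", refs=[ref]), transfer="copy")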

1543 @contextlib.contextmanager 

1544 def export( 

1545 self, 

1546 *, 

1547 directory: str | None = None, 

1548 filename: str | None = None, 

1549 format: str | None = None, 

1550 transfer: str | None = None, 

1551 ) -> Iterator[RepoExportContext]: 

1552 # Docstring inherited. 

1553 if directory is None and transfer is not None: 

1554 raise TypeError("Cannot transfer without providing a directory.") 

1555 if transfer == "move": 

1556 raise TypeError("Transfer may not be 'move': export is read-only") 

1557 if format is None: 

1558 if filename is None: 

1559 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1560 else: 

1561 _, format = os.path.splitext(filename) 

1562 if not format: 

1563 raise ValueError("Please specify a file extension to determine export format.") 

1564 format = format[1:] # Strip leading "." 

1565 elif filename is None: 

1566 filename = f"export.{format}" 

1567 if directory is not None: 

1568 filename = os.path.join(directory, filename) 

1569 formats = self._config["repo_transfer_formats"] 

1570 if format not in formats: 

1571 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1572 BackendClass = get_class_of(formats[format, "export"]) 

1573 with open(filename, "w") as stream: 

1574 backend = BackendClass(stream, universe=self.dimensions) 

1575 try: 

1576 helper = RepoExportContext( 

1577 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1578 ) 

1579 with self._caching_context(): 

1580 yield helper 

1581 except BaseException: 

1582 raise 

1583 else: 

1584 helper._finish() 

1585 
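# Example (illustrative sketch; the repository path, dataset type, collection,
# and output directory are hypothetical): export selected datasets and their
# associated metadata to a directory with a YAML index file.
#
#     butler = Butler("/repo/example")
#     with butler.export(directory="/tmp/export", filename="export.yaml", transfer="copy") as ctx:
#         ctx.saveDatasets(butler.registry.queryDatasets("calexp", collections="HSC/runs/test"))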

1586 def import_( 

1587 self, 

1588 *, 

1589 directory: ResourcePathExpression | None = None, 

1590 filename: ResourcePathExpression | TextIO | None = None, 

1591 format: str | None = None, 

1592 transfer: str | None = None, 

1593 skip_dimensions: set | None = None, 

1594 ) -> None: 

1595 # Docstring inherited. 

1596 if not self.isWriteable(): 

1597 raise TypeError("Butler is read-only.") 

1598 if format is None: 

1599 if filename is None: 

1600 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1601 else: 

1602 _, format = os.path.splitext(filename) # type: ignore 

1603 elif filename is None: 

1604 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1605 if directory is not None: 

1606 directory = ResourcePath(directory, forceDirectory=True) 

1607 # mypy doesn't think this will work but it does in python >= 3.10. 

1608 if isinstance(filename, ResourcePathExpression): # type: ignore 

1609 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1610 if not filename.isabs() and directory is not None: 

1611 potential = directory.join(filename) 

1612 exists_in_cwd = filename.exists() 

1613 exists_in_dir = potential.exists() 

1614 if exists_in_cwd and exists_in_dir: 

1615 _LOG.warning( 

1616 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1617 "Additionally, the file exists relative to the given search directory (%s). " 

1618 "Using the export file in the given directory.", 

1619 filename, 

1620 potential, 

1621 ) 

1622 # Given that they specified an explicit directory and that 

1623 # directory contains the export file, assume that is what 

1624 # was meant despite the file in the cwd. 

1625 filename = potential 

1626 elif exists_in_dir: 

1627 filename = potential 

1628 elif not exists_in_cwd and not exists_in_dir: 

1629 # Raise early. 

1630 raise FileNotFoundError( 

1631 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1632 ) 

1633 BackendClass: type[RepoImportBackend] = get_class_of( 

1634 self._config["repo_transfer_formats"][format]["import"] 

1635 ) 

1636 

1637 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1638 with self._caching_context(): 

1639 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1640 backend.register() 

1641 with self.transaction(): 

1642 backend.load( 

1643 self._datastore, 

1644 directory=directory, 

1645 transfer=transfer, 

1646 skip_dimensions=skip_dimensions, 

1647 ) 

1648 

1649 if isinstance(filename, ResourcePath): 

1650 # We cannot use open() here at the moment because of 

1651 # DM-38589, since yaml does stream.read(8192) in a loop. 

1652 stream = io.StringIO(filename.read().decode()) 

1653 doImport(stream) 

1654 else: 

1655 doImport(filename) # type: ignore 

1656 
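# Example (illustrative sketch; paths are hypothetical): import the export
# produced by export() above into another repository.
#
#     target = Butler("/repo/other", writeable=True)
#     target.import_(directory="/tmp/export", filename="export.yaml", transfer="copy")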

1657 def transfer_dimension_records_from( 

1658 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1659 ) -> None: 

1660 # Allowed dimensions in the target butler. 

1661 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1662 

1663 data_ids = {ref.dataId for ref in source_refs} 

1664 

1665 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1666 source_butler, data_ids, elements 

1667 ) 

1668 

1669 # Insert order is important. 

1670 for element in self.dimensions.sorted(dimension_records.keys()): 

1671 records = [r for r in dimension_records[element].values()] 

1672 # Assume that if the record is already present that we can 

1673 # use it without having to check that the record metadata 

1674 # is consistent. 

1675 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1676 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 

1677 
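# Example (illustrative sketch; the repository paths, dataset type, and
# collection are hypothetical): copy the dimension records needed by a set
# of refs from a source butler into this one.
#
#     source = Butler("/repo/source")
#     target = Butler("/repo/target", writeable=True)
#     refs = source.registry.queryDatasets("raw", collections="ingest/run")
#     target.transfer_dimension_records_from(source, refs)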

1678 def _extract_all_dimension_records_from_data_ids( 

1679 self, 

1680 source_butler: LimitedButler | Butler, 

1681 data_ids: set[DataCoordinate], 

1682 allowed_elements: frozenset[DimensionElement], 

1683 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1684 primary_records = self._extract_dimension_records_from_data_ids( 

1685 source_butler, data_ids, allowed_elements 

1686 ) 

1687 

1688 can_query = isinstance(source_butler, Butler) 

1689 

1690 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1691 for original_element, record_mapping in primary_records.items(): 

1692 # Get dimensions that depend on this dimension. 

1693 populated_by = self.dimensions.get_elements_populated_by( 

1694 self.dimensions[original_element.name] # type: ignore 

1695 ) 

1696 

1697 for data_id in record_mapping.keys(): 

1698 for element in populated_by: 

1699 if element not in allowed_elements: 

1700 continue 

1701 if element.name == original_element.name: 

1702 continue 

1703 

1704 if element.name in primary_records: 

1705 # If this element has already been stored, avoid 

1706 # re-finding records since that may lead to additional 

1707 # spurious records. E.g. visit is populated_by 

1708 # visit_detector_region, but querying 

1709 # visit_detector_region by visit will return all the 

1710 # detectors for this visit -- the visit dataId does not 

1711 # constrain this. 

1712 # To constrain the query, the original dataIds would 

1713 # have to be scanned. 

1714 continue 

1715 

1716 if not can_query: 

1717 raise RuntimeError( 

1718 f"Transferring populated_by records like {element.name} requires a full Butler." 

1719 ) 

1720 

1721 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1722 element.name, 

1723 **data_id.mapping, # type: ignore 

1724 ) 

1725 for record in records: 

1726 additional_records[record.definition].setdefault(record.dataId, record) 

1727 

1728 # The next step is to walk back through the additional records to 

1729 # pick up any missing content (such as visit_definition needing to 

1730 # know the exposure). We want to ensure we do not request 

1731 # records we already have. 

1732 missing_data_ids = set() 

1733 for name, record_mapping in additional_records.items(): 

1734 for data_id in record_mapping.keys(): 

1735 if data_id not in primary_records[name]: 

1736 missing_data_ids.add(data_id) 

1737 

1738 # Fill out the new records. Assume that these new records do not 

1739 # also need to carry over additional populated_by records. 

1740 secondary_records = self._extract_dimension_records_from_data_ids( 

1741 source_butler, missing_data_ids, allowed_elements 

1742 ) 

1743 

1744 # Merge the extra sets of records in with the original. 

1745 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1746 primary_records[name].update(record_mapping) 

1747 

1748 return primary_records 

1749 

1750 def _extract_dimension_records_from_data_ids( 

1751 self, 

1752 source_butler: LimitedButler | Butler, 

1753 data_ids: set[DataCoordinate], 

1754 allowed_elements: frozenset[DimensionElement], 

1755 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1756 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1757 

1758 for data_id in data_ids: 

1759 # We need an expanded record; if it is not expanded we need a full 

1760 # butler with a registry (allow mocks with a registry too). 

1761 if not data_id.hasRecords(): 

1762 if registry := getattr(source_butler, "registry", None): 

1763 data_id = registry.expandDataId(data_id) 

1764 else: 

1765 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1766 # If this butler doesn't know about a dimension in the source 

1767 # butler, things will break later. 

1768 for element_name in data_id.dimensions.elements: 

1769 record = data_id.records[element_name] 

1770 if record is not None and record.definition in allowed_elements: 

1771 dimension_records[record.definition].setdefault(record.dataId, record) 

1772 

1773 return dimension_records 

1774 
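# Shape sketch (illustrative; the element names and placeholder values are
# hypothetical): the two helpers above both return a nested mapping keyed
# first by dimension element and then by data ID, e.g.
#
#     {
#         universe["exposure"]: {exposure_data_id: exposure_record, ...},
#         universe["visit_definition"]: {visit_def_data_id: visit_def_record, ...},
#     }
#
# which is what transfer_dimension_records_from() and transfer_from() iterate
# over when inserting records in dependency order.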

1775 def transfer_from( 

1776 self, 

1777 source_butler: LimitedButler, 

1778 source_refs: Iterable[DatasetRef], 

1779 transfer: str = "auto", 

1780 skip_missing: bool = True, 

1781 register_dataset_types: bool = False, 

1782 transfer_dimensions: bool = False, 

1783 dry_run: bool = False, 

1784 ) -> collections.abc.Collection[DatasetRef]: 

1785 # Docstring inherited. 

1786 if not self.isWriteable(): 

1787 raise TypeError("Butler is read-only.") 

1788 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1789 

1790 # We will iterate through the refs multiple times so we need to 

1791 # convert to a list if this isn't a collection. 

1792 if not isinstance(source_refs, collections.abc.Collection): 

1793 source_refs = list(source_refs) 

1794 

1795 original_count = len(source_refs) 

1796 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1797 

1798 # In some situations the datastore artifact may be missing 

1799 # and we do not want that registry entry to be imported. 

1800 # Asking the datastore is not sufficient: the records may have been 

1801 # purged, so we have to ask for the (predicted) URI and check 

1802 # existence explicitly. Execution butler is set up exactly like 

1803 # this with no datastore records. 

1804 artifact_existence: dict[ResourcePath, bool] = {} 

1805 if skip_missing: 

1806 dataset_existence = source_butler._datastore.mexists( 

1807 source_refs, artifact_existence=artifact_existence 

1808 ) 

1809 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1810 filtered_count = len(source_refs) 

1811 n_missing = original_count - filtered_count 

1812 _LOG.verbose( 

1813 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1814 n_missing, 

1815 "" if n_missing == 1 else "s", 

1816 filtered_count, 

1817 ) 

1818 

1819 # Importing requires that we group the refs by dataset type and run 

1820 # before doing the import. 

1821 source_dataset_types = set() 

1822 grouped_refs = defaultdict(list) 

1823 for ref in source_refs: 

1824 grouped_refs[ref.datasetType, ref.run].append(ref) 

1825 source_dataset_types.add(ref.datasetType) 

1826 

1827 # Check to see if the dataset type in the source butler has 

1828 # the same definition in the target butler and register missing 

1829 # ones if requested. Registration must happen outside a transaction. 

1830 newly_registered_dataset_types = set() 

1831 for datasetType in source_dataset_types: 

1832 if register_dataset_types: 

1833 # Let this raise immediately if inconsistent. Continuing 

1834 # on to find additional inconsistent dataset types 

1835 # might result in additional unwanted dataset types being 

1836 # registered. 

1837 if self._registry.registerDatasetType(datasetType): 

1838 newly_registered_dataset_types.add(datasetType) 

1839 else: 

1840 # If the dataset type is missing, let it fail immediately. 

1841 target_dataset_type = self.get_dataset_type(datasetType.name) 

1842 if target_dataset_type != datasetType: 

1843 raise ConflictingDefinitionError( 

1844 "Source butler dataset type differs from definition" 

1845 f" in target butler: {datasetType} !=" 

1846 f" {target_dataset_type}" 

1847 ) 

1848 if newly_registered_dataset_types: 

1849 # We may have registered some even if there were inconsistencies, 

1850 # but we should let people know (or else remove them again). 

1851 _LOG.verbose( 

1852 "Registered the following dataset types in the target Butler: %s", 

1853 ", ".join(d.name for d in newly_registered_dataset_types), 

1854 ) 

1855 else: 

1856 _LOG.verbose("All required dataset types are known to the target Butler") 

1857 

1858 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1859 if transfer_dimensions: 

1860 # Collect all the dimension records for these refs. 

1861 # All dimensions are to be copied but the list of valid dimensions 

1862 # comes from this butler's universe. 

1863 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1864 dataIds = {ref.dataId for ref in source_refs} 

1865 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1866 source_butler, dataIds, elements 

1867 ) 

1868 

1869 handled_collections: set[str] = set() 

1870 

1871 # Do all the importing in a single transaction. 

1872 with self.transaction(): 

1873 if dimension_records and not dry_run: 

1874 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1875 # Order matters. 

1876 for element in self.dimensions.sorted(dimension_records.keys()): 

1877 records = [r for r in dimension_records[element].values()] 

1878 # Assume that if the record is already present that we can 

1879 # use it without having to check that the record metadata 

1880 # is consistent. 

1881 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1882 

1883 n_imported = 0 

1884 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1885 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1886 ): 

1887 if run not in handled_collections: 

1888 # We may need to create the output collection. If the source 

1889 # butler has a registry, ask for its documentation string. 

1890 run_doc = None 

1891 if registry := getattr(source_butler, "registry", None): 

1892 run_doc = registry.getCollectionDocumentation(run) 

1893 if not dry_run: 

1894 registered = self._registry.registerRun(run, doc=run_doc) 

1895 else: 

1896 registered = True 

1897 handled_collections.add(run) 

1898 if registered: 

1899 _LOG.verbose("Creating output run %s", run) 

1900 

1901 n_refs = len(refs_to_import) 

1902 _LOG.verbose( 

1903 "Importing %d ref%s of dataset type %s into run %s", 

1904 n_refs, 

1905 "" if n_refs == 1 else "s", 

1906 datasetType.name, 

1907 run, 

1908 ) 

1909 

1910 # Assume we are using UUIDs and the source refs will match 

1911 # those imported. 

1912 if not dry_run: 

1913 imported_refs = self._registry._importDatasets(refs_to_import) 

1914 else: 

1915 imported_refs = refs_to_import 

1916 assert set(imported_refs) == set(refs_to_import) 

1917 n_imported += len(imported_refs) 

1918 

1919 assert len(source_refs) == n_imported 

1920 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

1921 

1922 # Ask the datastore to transfer. The datastore has to check that 

1923 # the source datastore is compatible with the target datastore. 

1924 accepted, rejected = self._datastore.transfer_from( 

1925 source_butler._datastore, 

1926 source_refs, 

1927 transfer=transfer, 

1928 artifact_existence=artifact_existence, 

1929 dry_run=dry_run, 

1930 ) 

1931 if rejected: 

1932 # For now, accept the registry entries but not the files. 

1933 _LOG.warning( 

1934 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

1935 len(rejected), 

1936 len(accepted), 

1937 datasetType, 

1938 run, 

1939 ) 

1940 

1941 return source_refs 

1942 
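# Example (illustrative sketch; the repositories, dataset type, and collection
# are hypothetical): transfer registry entries and datastore artifacts
# between butlers, registering dataset types and dimension records as needed.
#
#     source = Butler("/repo/source")
#     target = Butler("/repo/target", writeable=True)
#     refs = source.registry.queryDatasets("calexp", collections="HSC/runs/test")
#     target.transfer_from(
#         source,
#         refs,
#         transfer="copy",
#         register_dataset_types=True,
#         transfer_dimensions=True,
#     )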

1943 def validateConfiguration( 

1944 self, 

1945 logFailures: bool = False, 

1946 datasetTypeNames: Iterable[str] | None = None, 

1947 ignore: Iterable[str] | None = None, 

1948 ) -> None: 

1949 # Docstring inherited. 

1950 if datasetTypeNames: 

1951 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

1952 else: 

1953 datasetTypes = list(self._registry.queryDatasetTypes()) 

1954 

1955 # Filter out anything from the ignore list. 

1956 if ignore: 

1957 ignore = set(ignore) 

1958 datasetTypes = [ 

1959 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

1960 ] 

1961 else: 

1962 ignore = set() 

1963 

1964 # For each datasetType that has an instrument dimension, create 

1965 # a DatasetRef for each defined instrument 

1966 datasetRefs = [] 

1967 

1968 # Find all the registered instruments (if "instrument" is in the 

1969 # universe). 

1970 if "instrument" in self.dimensions: 

1971 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

1972 

1973 for datasetType in datasetTypes: 

1974 if "instrument" in datasetType.dimensions: 

1975 # In order to create a conforming dataset ref, create 

1976 # fake DataCoordinate values for the non-instrument 

1977 # dimensions. The type of the value does not matter here. 

1978 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

1979 

1980 for instrument in instruments: 

1981 datasetRef = DatasetRef( 

1982 datasetType, 

1983 DataCoordinate.standardize( 

1984 dataId, instrument=instrument, dimensions=datasetType.dimensions 

1985 ), 

1986 run="validate", 

1987 ) 

1988 datasetRefs.append(datasetRef) 

1989 

1990 entities: list[DatasetType | DatasetRef] = [] 

1991 entities.extend(datasetTypes) 

1992 entities.extend(datasetRefs) 

1993 

1994 datastoreErrorStr = None 

1995 try: 

1996 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

1997 except ValidationError as e: 

1998 datastoreErrorStr = str(e) 

1999 

2000 # Also check that the LookupKeys used by the datastores match 

2001 # registry and storage class definitions 

2002 keys = self._datastore.getLookupKeys() 

2003 

2004 failedNames = set() 

2005 failedDataId = set() 

2006 for key in keys: 

2007 if key.name is not None: 

2008 if key.name in ignore: 

2009 continue 

2010 

2011 # skip if specific datasetType names were requested and this 

2012 # name does not match 

2013 if datasetTypeNames and key.name not in datasetTypeNames: 

2014 continue 

2015 

2016 # See if it is a StorageClass or a DatasetType 

2017 if key.name in self.storageClasses: 

2018 pass 

2019 else: 

2020 try: 

2021 self.get_dataset_type(key.name) 

2022 except KeyError: 

2023 if logFailures: 

2024 _LOG.critical( 

2025 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2026 ) 

2027 failedNames.add(key) 

2028 else: 

2029 # Dimensions are checked for consistency when the Butler 

2030 # is created and rendezvoused with a universe. 

2031 pass 

2032 

2033 # Check that the instrument is a valid instrument. 

2034 # Currently only an "instrument" dataId key is supported, so check for that. 

2035 if key.dataId: 

2036 dataIdKeys = set(key.dataId) 

2037 if {"instrument"} != dataIdKeys: 

2038 if logFailures: 

2039 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2040 failedDataId.add(key) 

2041 elif key.dataId["instrument"] not in instruments: 

2042 if logFailures: 

2043 _LOG.critical("Key '%s' has unknown instrument", key) 

2044 failedDataId.add(key) 

2045 

2046 messages = [] 

2047 

2048 if datastoreErrorStr: 

2049 messages.append(datastoreErrorStr) 

2050 

2051 for failed, msg in ( 

2052 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2053 (failedDataId, "Keys with bad DataId entries: "), 

2054 ): 

2055 if failed: 

2056 msg += ", ".join(str(k) for k in failed) 

2057 messages.append(msg) 

2058 

2059 if messages: 

2060 raise ValidationError(";\n".join(messages)) 

2061 
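# Example (illustrative sketch; the repository path is hypothetical): check
# that the datastore configuration is consistent with the registered dataset
# types and storage classes, logging each failure before raising
# ValidationError.
#
#     butler = Butler("/repo/example")
#     butler.validateConfiguration(logFailures=True)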

2062 @property 

2063 def collections(self) -> Sequence[str]: 

2064 """The collections to search by default, in order 

2065 (`~collections.abc.Sequence` [ `str` ]). 

2066 

2067 This is an alias for ``self.registry.defaults.collections``. It cannot 

2068 be set directly in isolation, but all defaults may be changed together 

2069 by assigning a new `RegistryDefaults` instance to 

2070 ``self.registry.defaults``. 

2071 """ 

2072 return self._registry.defaults.collections 

2073 

2074 @property 

2075 def run(self) -> str | None: 

2076 """Name of the run this butler writes outputs to by default (`str` or 

2077 `None`). 

2078 

2079 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2080 directly in isolation, but all defaults may be changed together by 

2081 assigning a new `RegistryDefaults` instance to 

2082 ``self.registry.defaults``. 

2083 """ 

2084 return self._registry.defaults.run 

2085 
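# Example (illustrative sketch; the collection and run names are
# hypothetical): as noted in the property docstrings above, defaults can only
# be replaced wholesale by assigning a new RegistryDefaults instance.
#
#     from lsst.daf.butler.registry import RegistryDefaults
#
#     butler.registry.defaults = RegistryDefaults(
#         collections=["HSC/defaults"], run="u/someone/run"
#     )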

2086 @property 

2087 def registry(self) -> Registry: 

2088 """The object that manages dataset metadata and relationships 

2089 (`Registry`). 

2090 

2091 Many operations that don't involve reading or writing butler datasets 

2092 are accessible only via `Registry` methods. Eventually these methods 

2093 will be replaced by equivalent `Butler` methods. 

2094 """ 

2095 return self._registry_shim 

2096 

2097 @property 

2098 def dimensions(self) -> DimensionUniverse: 

2099 # Docstring inherited. 

2100 return self._registry.dimensions 

2101 

2102 @contextlib.contextmanager 

2103 def _query(self) -> Iterator[Query]: 

2104 # Docstring inherited. 

2105 raise NotImplementedError("TODO DM-41159") 

2106 

2107 def _preload_cache(self) -> None: 

2108 """Immediately load caches that are used for common operations.""" 

2109 self._registry.preload_cache() 

2110 

2111 _config: ButlerConfig 

2112 """Configuration for this Butler instance.""" 

2113 

2114 _registry: SqlRegistry 

2115 """The object that manages dataset metadata and relationships 

2116 (`SqlRegistry`). 

2117 

2118 Most operations that don't involve reading or writing butler datasets are 

2119 accessible only via `SqlRegistry` methods. 

2120 """ 

2121 

2122 datastore: Datastore 

2123 """The object that manages actual dataset storage (`Datastore`). 

2124 

2125 Direct user access to the datastore should rarely be necessary; the primary 

2126 exception is the case where a `Datastore` implementation provides extra 

2127 functionality beyond what the base class defines. 

2128 """ 

2129 

2130 storageClasses: StorageClassFactory 

2131 """An object that maps known storage class names to objects that fully 

2132 describe them (`StorageClassFactory`). 

2133 """ 

2134 

2135 _registry_shim: RegistryShim 

2136 """Shim object to provide a legacy public interface for querying via the 

2137 ``registry`` property. 

2138 """