Coverage for python/lsst/daf/butler/direct_butler.py: 10%

753 statements  

coverage.py v7.4.4, created at 2024-04-05 10:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import itertools 

41import logging 

42import numbers 

43import os 

44import warnings 

45from collections import Counter, defaultdict 

46from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

47from typing import TYPE_CHECKING, Any, ClassVar, TextIO, cast 

48 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.iteration import ensure_iterable 

52from lsst.utils.logging import VERBOSE, getLogger 

53from sqlalchemy.exc import IntegrityError 

54 

55from ._butler import Butler 

56from ._butler_config import ButlerConfig 

57from ._butler_instance_options import ButlerInstanceOptions 

58from ._dataset_existence import DatasetExistence 

59from ._dataset_ref import DatasetRef 

60from ._dataset_type import DatasetType 

61from ._deferredDatasetHandle import DeferredDatasetHandle 

62from ._exceptions import DatasetNotFoundError, DimensionValueError, ValidationError 

63from ._limited_butler import LimitedButler 

64from ._registry_shim import RegistryShim 

65from ._storage_class import StorageClass, StorageClassFactory 

66from ._timespan import Timespan 

67from .datastore import Datastore, NullDatastore 

68from .dimensions import DataCoordinate, Dimension 

69from .progress import Progress 

70from .queries import Query 

71from .registry import ( 

72 CollectionType, 

73 ConflictingDefinitionError, 

74 DataIdError, 

75 MissingDatasetTypeError, 

76 RegistryDefaults, 

77 _RegistryFactory, 

78) 

79from .registry.sql_registry import SqlRegistry 

80from .transfers import RepoExportContext 

81from .utils import transactional 

82 

83if TYPE_CHECKING: 

84 from lsst.resources import ResourceHandleProtocol 

85 

86 from ._dataset_ref import DatasetId 

87 from ._file_dataset import FileDataset 

88 from .datastore import DatasetRefURIs 

89 from .dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse 

90 from .registry import Registry 

91 from .transfers import RepoImportBackend 

92 

93_LOG = getLogger(__name__) 

94 

95 

96class ButlerValidationError(ValidationError): 

97 """There is a problem with the Butler configuration.""" 

98 

99 pass 

100 

101 

102class DirectButler(Butler): # numpydoc ignore=PR02 

103 """Main entry point for the data access system. 

104 

105 Parameters 

106 ---------- 

107 config : `ButlerConfig` 

108 The configuration for this Butler instance. 

109 registry : `SqlRegistry` 

110 The object that manages dataset metadata and relationships. 

111 datastore : `Datastore`

112 The object that manages actual dataset storage. 

113 storageClasses : `StorageClassFactory`

114 An object that maps known storage class names to objects that fully 

115 describe them. 

116 

117 Notes 

118 ----- 

119 Most users should call the top-level `Butler`.``from_config`` instead of 

120 using this constructor directly. 

121 """ 
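
# Usage note: as the Notes above say, most users obtain an instance through
# the top-level ``Butler.from_config`` rather than this constructor. A
# minimal sketch; the repository path "/repo" and the "HSC/defaults"
# collection are illustrative assumptions, not defined in this module:
#
#     from lsst.daf.butler import Butler
#
#     butler = Butler.from_config("/repo", collections="HSC/defaults", writeable=False)
#     # ``butler`` is a DirectButler when the configuration describes a SQL
#     # registry plus a local datastore.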

122 

123 # This is __new__ instead of __init__ because we have to support 

124 # instantiation via the legacy constructor Butler.__new__(), which 

125 # reads the configuration and selects which subclass to instantiate. The 

126 # interaction between __new__ and __init__ is kind of wacky in Python. If 

127 # we were using __init__ here, __init__ would be called twice (once when 

128 # the DirectButler instance is constructed inside Butler.from_config(), and 

129 # a second time with the original arguments to Butler() when the instance 

130 # is returned from Butler.__new__()). 

131 def __new__( 

132 cls, 

133 *, 

134 config: ButlerConfig, 

135 registry: SqlRegistry, 

136 datastore: Datastore, 

137 storageClasses: StorageClassFactory, 

138 ) -> DirectButler: 

139 self = cast(DirectButler, super().__new__(cls)) 

140 self._config = config 

141 self._registry = registry 

142 self._datastore = datastore 

143 self.storageClasses = storageClasses 

144 

145 # For execution butler the datastore needs a special 

146 # dependency-inversion trick. This is not used by regular butler, 

147 # but we do not have a way to distinguish regular butler from execution 

148 # butler. 

149 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

150 

151 self._registry_shim = RegistryShim(self) 

152 

153 return self 

154 

155 @classmethod 

156 def create_from_config( 

157 cls, 

158 config: ButlerConfig, 

159 *, 

160 options: ButlerInstanceOptions, 

161 without_datastore: bool = False, 

162 ) -> DirectButler: 

163 """Construct a Butler instance from a configuration file. 

164 

165 Parameters 

166 ---------- 

167 config : `ButlerConfig` 

168 The configuration for this Butler instance. 

169 options : `ButlerInstanceOptions` 

170 Default values and other settings for the Butler instance. 

171 without_datastore : `bool`, optional 

172 If `True` do not attach a datastore to this butler. Any attempts 

173 to use a datastore will fail. 

174 

175 Notes 

176 ----- 

177 Most users should call the top-level `Butler`.``from_config`` 

178 instead of using this function directly. 

179 """ 

180 if "run" in config or "collection" in config: 

181 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

182 

183 defaults = RegistryDefaults( 

184 collections=options.collections, run=options.run, infer=options.inferDefaults, **options.kwargs 

185 ) 

186 try: 

187 butlerRoot = config.get("root", config.configDir) 

188 writeable = options.writeable 

189 if writeable is None: 

190 writeable = options.run is not None 

191 registry = _RegistryFactory(config).from_config( 

192 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

193 ) 

194 if without_datastore: 

195 datastore: Datastore = NullDatastore(None, None) 

196 else: 

197 datastore = Datastore.fromConfig( 

198 config, registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

199 ) 

200 # TODO: Once datastore drops dependency on registry we can 

201 # construct datastore first and pass opaque tables to registry 

202 # constructor. 

203 registry.make_datastore_tables(datastore.get_opaque_table_definitions()) 

204 storageClasses = StorageClassFactory() 

205 storageClasses.addFromConfig(config) 

206 

207 return DirectButler( 

208 config=config, registry=registry, datastore=datastore, storageClasses=storageClasses 

209 ) 

210 except Exception: 

211 # Failures here usually mean that configuration is incomplete, 

212 # just issue an error message which includes config file URI. 

213 _LOG.error(f"Failed to instantiate Butler from config {config.configFile}.") 

214 raise 

215 

216 def _clone( 

217 self, 

218 *, 

219 collections: Any = None, 

220 run: str | None = None, 

221 inferDefaults: bool = True, 

222 **kwargs: Any, 

223 ) -> DirectButler: 

224 # Docstring inherited 

225 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

226 registry = self._registry.copy(defaults) 

227 

228 return DirectButler( 

229 registry=registry, 

230 config=self._config, 

231 datastore=self._datastore.clone(registry.getDatastoreBridgeManager()), 

232 storageClasses=self.storageClasses, 

233 ) 

234 

235 GENERATION: ClassVar[int] = 3 

236 """This is a Generation 3 Butler. 

237 

238 This attribute may be removed in the future, once the Generation 2 Butler 

239 interface has been fully retired; it should only be used in transitional 

240 code. 

241 """ 

242 

243 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

244 """Return DatasetType defined in registry given dataset type name.""" 

245 try: 

246 return self.get_dataset_type(name) 

247 except MissingDatasetTypeError: 

248 return None 

249 

250 @classmethod 

251 def _unpickle( 

252 cls, 

253 config: ButlerConfig, 

254 collections: tuple[str, ...] | None, 

255 run: str | None, 

256 defaultDataId: dict[str, str], 

257 writeable: bool, 

258 ) -> DirectButler: 

259 """Callable used to unpickle a Butler. 

260 

261 We prefer not to use ``Butler.__init__`` directly so we can force some 

262 of its many arguments to be keyword-only (note that ``__reduce__`` 

263 can only invoke callables with positional arguments). 

264 

265 Parameters 

266 ---------- 

267 config : `ButlerConfig` 

268 Butler configuration, already coerced into a true `ButlerConfig` 

269 instance (and hence after any search paths for overrides have been 

270 utilized). 

271 collections : `tuple` [ `str` ] 

272 Names of the default collections to read from. 

273 run : `str`, optional 

274 Name of the default `~CollectionType.RUN` collection to write to. 

275 defaultDataId : `dict` [ `str`, `str` ] 

276 Default data ID values. 

277 writeable : `bool` 

278 Whether the Butler should support write operations. 

279 

280 Returns 

281 ------- 

282 butler : `Butler` 

283 A new `Butler` instance. 

284 """ 

285 return cls.create_from_config( 

286 config=config, 

287 options=ButlerInstanceOptions( 

288 collections=collections, run=run, writeable=writeable, kwargs=defaultDataId 

289 ), 

290 ) 

291 

292 def __reduce__(self) -> tuple: 

293 """Support pickling.""" 

294 return ( 

295 DirectButler._unpickle, 

296 ( 

297 self._config, 

298 self.collections, 

299 self.run, 

300 dict(self._registry.defaults.dataId.required), 

301 self._registry.isWriteable(), 

302 ), 

303 ) 

304 

305 def __str__(self) -> str: 

306 return ( 

307 f"Butler(collections={self.collections}, run={self.run}, " 

308 f"datastore='{self._datastore}', registry='{self._registry}')" 

309 ) 

310 

311 def isWriteable(self) -> bool: 

312 # Docstring inherited. 

313 return self._registry.isWriteable() 

314 

315 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

316 """Context manager that enables caching.""" 

317 return self._registry.caching_context() 

318 

319 @contextlib.contextmanager 

320 def transaction(self) -> Iterator[None]: 

321 """Context manager supporting `Butler` transactions. 

322 

323 Transactions can be nested. 

324 """ 

325 with self._registry.transaction(), self._datastore.transaction(): 

326 yield 
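
# Usage note: operations performed inside ``transaction()`` commit or roll
# back together across registry and datastore. A minimal sketch; the
# "catalog" and "catalog_summary" dataset types and data ID values are
# illustrative assumptions:
#
#     with butler.transaction():
#         butler.put(catalog, "catalog", instrument="HSC", visit=1228)
#         butler.put(summary, "catalog_summary", instrument="HSC", visit=1228)
#     # If the second put() raises, the first is rolled back as well.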

327 

328 def _standardizeArgs( 

329 self, 

330 datasetRefOrType: DatasetRef | DatasetType | str, 

331 dataId: DataId | None = None, 

332 for_put: bool = True, 

333 **kwargs: Any, 

334 ) -> tuple[DatasetType, DataId | None]: 

335 """Standardize the arguments passed to several Butler APIs. 

336 

337 Parameters 

338 ---------- 

339 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

340 When `DatasetRef` the `dataId` should be `None`. 

341 Otherwise the `DatasetType` or name thereof. 

342 dataId : `dict` or `DataCoordinate` 

343 A `dict` of `Dimension` link name, value pairs that label the 

344 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

345 should be provided as the second argument. 

346 for_put : `bool`, optional 

347 If `True` this call is invoked as part of a `Butler.put()`. 

348 Otherwise it is assumed to be part of a `Butler.get()`. This 

349 parameter is only relevant if there is dataset type 

350 inconsistency. 

351 **kwargs 

352 Additional keyword arguments used to augment or construct a 

353 `DataCoordinate`. See `DataCoordinate.standardize` 

354 parameters. 

355 

356 Returns 

357 ------- 

358 datasetType : `DatasetType` 

359 A `DatasetType` instance extracted from ``datasetRefOrType``. 

360 dataId : `dict` or `DataId`, optional 

361 Argument that can be used (along with ``kwargs``) to construct a 

362 `DataId`. 

363 

364 Notes 

365 ----- 

366 Butler APIs that conceptually need a DatasetRef also allow passing a 

367 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

368 keyword arguments that can be used to construct one) separately. This 

369 method accepts those arguments and always returns a true `DatasetType` 

370 and a `DataId` or `dict`. 

371 

372 Standardization of `dict` vs `DataId` is best handled by passing the 

373 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

374 generally similarly flexible. 

375 """ 

376 externalDatasetType: DatasetType | None = None 

377 internalDatasetType: DatasetType | None = None 

378 if isinstance(datasetRefOrType, DatasetRef): 

379 if dataId is not None or kwargs: 

380 raise ValueError("DatasetRef given, cannot use dataId as well") 

381 externalDatasetType = datasetRefOrType.datasetType 

382 dataId = datasetRefOrType.dataId 

383 else: 

384 # Don't check whether DataId is provided, because Registry APIs 

385 # can usually construct a better error message when it wasn't. 

386 if isinstance(datasetRefOrType, DatasetType): 

387 externalDatasetType = datasetRefOrType 

388 else: 

389 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

390 

391 # Check that they are self-consistent 

392 if externalDatasetType is not None: 

393 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

394 if externalDatasetType != internalDatasetType: 

395 # We can allow differences if they are compatible, depending 

396 # on whether this is a get or a put. A get requires that 

397 # the python type associated with the datastore can be 

398 # converted to the user type. A put requires that the user 

399 # supplied python type can be converted to the internal 

400 # type expected by registry. 

401 relevantDatasetType = internalDatasetType 

402 if for_put: 

403 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

404 else: 

405 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

406 relevantDatasetType = externalDatasetType 

407 if not is_compatible: 

408 raise ValueError( 

409 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

410 f"registry definition ({internalDatasetType})" 

411 ) 

412 # Override the internal definition. 

413 internalDatasetType = relevantDatasetType 

414 

415 assert internalDatasetType is not None 

416 return internalDatasetType, dataId 

417 

418 def _rewrite_data_id( 

419 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

420 ) -> tuple[DataId | None, dict[str, Any]]: 

421 """Rewrite a data ID taking into account dimension records. 

422 

423 Take a Data ID and keyword args and rewrite it if necessary to 

424 allow the user to specify dimension records rather than dimension 

425 primary values. 

426 

427 This allows a user to include a dataId dict with keys of 

428 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

429 the integer exposure ID. It also allows a string to be given 

430 for a dimension value rather than the integer ID if that is more 

431 convenient. For example, rather than having to specify the 

432 detector with ``detector.full_name``, a string given for ``detector`` 

433 will be interpreted as the full name and converted to the integer 

434 value. 

435 

436 Keyword arguments can also use strings for dimensions like detector 

437 and exposure but Python does not allow them to include ``.`` and 

438 so the ``exposure.day_obs`` syntax cannot be used in a keyword 

439 argument. 

440 

441 Parameters 

442 ---------- 

443 dataId : `dict` or `DataCoordinate` 

444 A `dict` of `Dimension` link name, value pairs that will label the 

445 `DatasetRef` within a Collection. 

446 datasetType : `DatasetType` 

447 The dataset type associated with this dataId. Required to 

448 determine the relevant dimensions. 

449 **kwargs 

450 Additional keyword arguments used to augment or construct a 

451 `DataId`. See `DataId` parameters. 

452 

453 Returns 

454 ------- 

455 dataId : `dict` or `DataCoordinate` 

456 The possibly rewritten dataId. If given a `DataCoordinate` and 

457 no keyword arguments, the original dataId will be returned 

458 unchanged. 

459 **kwargs : `dict` 

460 Any unused keyword arguments (normally an empty dict). 

461 """ 
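
# Usage note: the rewriting described above lets callers identify dimensions
# by record values instead of primary keys. A sketch; the "raw" dataset type
# and the instrument/exposure values are illustrative assumptions:
#
#     # These two calls identify the same dataset. Compound keys such as
#     # "exposure.day_obs" must go in the dataId dict, not in kwargs.
#     butler.get("raw", instrument="HSC", detector=10, exposure=903334)
#     butler.get(
#         "raw",
#         {"exposure.day_obs": 20130617, "exposure.seq_num": 34},
#         instrument="HSC",
#         detector=10,
#     )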

462 # Do nothing if we have a standalone DataCoordinate. 

463 if isinstance(dataId, DataCoordinate) and not kwargs: 

464 return dataId, kwargs 

465 

466 # Process dimension records that are using record information 

467 # rather than ids 

468 newDataId: dict[str, DataIdValue] = {} 

469 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

470 

471 # if all the dataId comes from keyword parameters we do not need 

472 # to do anything here because they can't be of the form 

473 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

474 if dataId: 

475 for k, v in dataId.items(): 

476 # If we have a Dimension we do not need to do anything 

477 # because it cannot be a compound key. 

478 if isinstance(k, str) and "." in k: 

479 # Someone is using a more human-readable dataId 

480 dimensionName, record = k.split(".", 1) 

481 byRecord[dimensionName][record] = v 

482 elif isinstance(k, Dimension): 

483 newDataId[k.name] = v 

484 else: 

485 newDataId[k] = v 

486 

487 # Go through the updated dataId and check the type in case someone is 

488 # using an alternate key. We have already filtered out the compound 

489 # keys dimensions.record format. 

490 not_dimensions = {} 

491 

492 # Will need to look in the dataId and the keyword arguments 

493 # and will remove them if they need to be fixed or are unrecognized. 

494 for dataIdDict in (newDataId, kwargs): 

495 # Use a list so we can adjust the dict safely in the loop 

496 for dimensionName in list(dataIdDict): 

497 value = dataIdDict[dimensionName] 

498 try: 

499 dimension = self.dimensions.dimensions[dimensionName] 

500 except KeyError: 

501 # This is not a real dimension 

502 not_dimensions[dimensionName] = value 

503 del dataIdDict[dimensionName] 

504 continue 

505 

506 # Convert an integral type to an explicit int to simplify 

507 # comparisons here 

508 if isinstance(value, numbers.Integral): 

509 value = int(value) 

510 

511 if not isinstance(value, dimension.primaryKey.getPythonType()): 

512 for alternate in dimension.alternateKeys: 

513 if isinstance(value, alternate.getPythonType()): 

514 byRecord[dimensionName][alternate.name] = value 

515 del dataIdDict[dimensionName] 

516 _LOG.debug( 

517 "Converting dimension %s to %s.%s=%s", 

518 dimensionName, 

519 dimensionName, 

520 alternate.name, 

521 value, 

522 ) 

523 break 

524 else: 

525 _LOG.warning( 

526 "Type mismatch found for value '%r' provided for dimension %s. " 

527 "Could not find matching alternative (primary key has type %s) " 

528 "so attempting to use as-is.", 

529 value, 

530 dimensionName, 

531 dimension.primaryKey.getPythonType(), 

532 ) 

533 

534 # By this point kwargs and newDataId should only include valid 

535 # dimensions. Merge kwargs in to the new dataId and log if there 

536 # are dimensions in both (rather than calling update). 

537 for k, v in kwargs.items(): 

538 if k in newDataId and newDataId[k] != v: 

539 _LOG.debug( 

540 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

541 ) 

542 newDataId[k] = v 

543 # No need to retain any values in kwargs now. 

544 kwargs = {} 

545 

546 # If we have some unrecognized dimensions we have to try to connect 

547 # them to records in other dimensions. This is made more complicated 

548 # by some dimensions having records with clashing names. A mitigation 

549 # is that we can tell by this point which dimensions are missing 

550 # for the DatasetType but this does not work for calibrations 

551 # where additional dimensions can be used to constrain the temporal 

552 # axis. 

553 if not_dimensions: 

554 # Search for all dimensions even if we have been given a value 

555 # explicitly. In some cases records are given as well as the 

556 # actual dimension and this should not be an error if they 

557 # match. 

558 mandatoryDimensions = datasetType.dimensions.names # - provided 

559 

560 candidateDimensions: set[str] = set() 

561 candidateDimensions.update(mandatoryDimensions) 

562 

563 # For calibrations we may well be needing temporal dimensions 

564 # so rather than always including all dimensions in the scan 

565 # restrict things a little. It is still possible for there 

566 # to be confusion over day_obs in visit vs exposure for example. 

567 # If we are not searching calibration collections things may 

568 # fail but they are going to fail anyway because of the 

569 # ambiguity of the dataId... 

570 if datasetType.isCalibration(): 

571 for dim in self.dimensions.dimensions: 

572 if dim.temporal: 

573 candidateDimensions.add(str(dim)) 

574 

575 # Look up table for the first association with a dimension 

576 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

577 

578 # Keep track of whether an item is associated with multiple 

579 # dimensions. 

580 counter: Counter[str] = Counter() 

581 assigned: dict[str, set[str]] = defaultdict(set) 

582 

583 # Go through the missing dimensions and associate the 

584 # given names with records within those dimensions 

585 matched_dims = set() 

586 for dimensionName in candidateDimensions: 

587 dimension = self.dimensions.dimensions[dimensionName] 

588 fields = dimension.metadata.names | dimension.uniqueKeys.names 

589 for field in not_dimensions: 

590 if field in fields: 

591 guessedAssociation[dimensionName][field] = not_dimensions[field] 

592 counter[dimensionName] += 1 

593 assigned[field].add(dimensionName) 

594 matched_dims.add(field) 

595 

596 # Calculate the fields that matched nothing. 

597 never_found = set(not_dimensions) - matched_dims 

598 

599 if never_found: 

600 raise DimensionValueError(f"Unrecognized keyword args given: {never_found}") 

601 

602 # There is a chance we have allocated a single dataId item 

603 # to multiple dimensions. Need to decide which should be retained. 

604 # For now assume that the most popular alternative wins. 

605 # This means that day_obs with seq_num will result in 

606 # exposure.day_obs and not visit.day_obs 

607 # Also prefer an explicitly missing dimension over an inferred 

608 # temporal dimension. 

609 for fieldName, assignedDimensions in assigned.items(): 

610 if len(assignedDimensions) > 1: 

611 # Pick the most popular (preferring mandatory dimensions) 

612 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

613 if requiredButMissing: 

614 candidateDimensions = requiredButMissing 

615 else: 

616 candidateDimensions = assignedDimensions 

617 

618 # If this is a choice between visit and exposure and 

619 # neither was a required part of the dataset type, 

620 # (hence in this branch) always prefer exposure over 

621 # visit since exposures are always defined and visits 

622 # are defined from exposures. 

623 if candidateDimensions == {"exposure", "visit"}: 

624 candidateDimensions = {"exposure"} 

625 

626 # Select the relevant items and get a new restricted 

627 # counter. 

628 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

629 duplicatesCounter: Counter[str] = Counter() 

630 duplicatesCounter.update(theseCounts) 

631 

632 # Choose the most common. If they are equally common 

633 # we will pick the one that was found first. 

634 # Returns a list of tuples 

635 selected = duplicatesCounter.most_common(1)[0][0] 

636 

637 _LOG.debug( 

638 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

639 " Removed ambiguity by choosing dimension %s.", 

640 fieldName, 

641 ", ".join(assignedDimensions), 

642 selected, 

643 ) 

644 

645 for candidateDimension in assignedDimensions: 

646 if candidateDimension != selected: 

647 del guessedAssociation[candidateDimension][fieldName] 

648 

649 # Update the record look up dict with the new associations 

650 for dimensionName, values in guessedAssociation.items(): 

651 if values: # A dict might now be empty 

652 _LOG.debug( 

653 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

654 ) 

655 byRecord[dimensionName].update(values) 

656 

657 if byRecord: 

658 # Some record specifiers were found so we need to convert 

659 # them to the Id form 

660 for dimensionName, values in byRecord.items(): 

661 if dimensionName in newDataId: 

662 _LOG.debug( 

663 "DataId specified explicit %s dimension value of %s in addition to" 

664 " general record specifiers for it of %s. Ignoring record information.", 

665 dimensionName, 

666 newDataId[dimensionName], 

667 str(values), 

668 ) 

669 # Get the actual record and compare with these values. 

670 try: 

671 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

672 except DataIdError: 

673 raise DimensionValueError( 

674 f"Could not find dimension '{dimensionName}'" 

675 f" with dataId {newDataId} as part of comparing with" 

676 f" record values {byRecord[dimensionName]}" 

677 ) from None 

678 if len(recs) == 1: 

679 errmsg: list[str] = [] 

680 for k, v in values.items(): 

681 if (recval := getattr(recs[0], k)) != v: 

682 errmsg.append(f"{k}({recval} != {v})") 

683 if errmsg: 

684 raise DimensionValueError( 

685 f"Dimension {dimensionName} in dataId has explicit value" 

686 " inconsistent with records: " + ", ".join(errmsg) 

687 ) 

688 else: 

689 # Multiple matches for an explicit dimension 

690 # should never happen but let downstream complain. 

691 pass 

692 continue 

693 

694 # Build up a WHERE expression 

695 bind = dict(values.items()) 

696 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

697 

698 # Hopefully we get a single record that matches 

699 records = set( 

700 self._registry.queryDimensionRecords( 

701 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

702 ) 

703 ) 

704 

705 if len(records) != 1: 

706 if len(records) > 1: 

707 # visit can have an ambiguous answer without involving 

708 # visit_system. The default visit_system is defined 

709 # by the instrument. 

710 if ( 

711 dimensionName == "visit" 

712 and "visit_system_membership" in self.dimensions 

713 and "visit_system" in self.dimensions["instrument"].metadata 

714 ): 

715 instrument_records = list( 

716 self._registry.queryDimensionRecords( 

717 "instrument", 

718 dataId=newDataId, 

719 **kwargs, 

720 ) 

721 ) 

722 if len(instrument_records) == 1: 

723 visit_system = instrument_records[0].visit_system 

724 if visit_system is None: 

725 # Set to a value that will never match. 

726 visit_system = -1 

727 

728 # Look up each visit in the 

729 # visit_system_membership records. 

730 for rec in records: 

731 membership = list( 

732 self._registry.queryDimensionRecords( 

733 # Use bind to allow zero results. 

734 # This is a fully-specified query. 

735 "visit_system_membership", 

736 where="instrument = inst AND visit_system = system AND visit = v", 

737 bind=dict( 

738 inst=instrument_records[0].name, system=visit_system, v=rec.id 

739 ), 

740 ) 

741 ) 

742 if membership: 

743 # This record is the right answer. 

744 records = {rec} 

745 break 

746 

747 # The ambiguity may have been resolved so check again. 

748 if len(records) > 1: 

749 _LOG.debug( 

750 "Received %d records from constraints of %s", len(records), str(values) 

751 ) 

752 for r in records: 

753 _LOG.debug("- %s", str(r)) 

754 raise DimensionValueError( 

755 f"DataId specification for dimension {dimensionName} is not" 

756 f" uniquely constrained to a single dataset by {values}." 

757 f" Got {len(records)} results." 

758 ) 

759 else: 

760 raise DimensionValueError( 

761 f"DataId specification for dimension {dimensionName} matched no" 

762 f" records when constrained by {values}" 

763 ) 

764 

765 # Get the primary key from the real dimension object 

766 dimension = self.dimensions.dimensions[dimensionName] 

767 if not isinstance(dimension, Dimension): 

768 raise RuntimeError( 

769 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

770 ) 

771 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

772 

773 return newDataId, kwargs 

774 

775 def _findDatasetRef( 

776 self, 

777 datasetRefOrType: DatasetRef | DatasetType | str, 

778 dataId: DataId | None = None, 

779 *, 

780 collections: Any = None, 

781 predict: bool = False, 

782 run: str | None = None, 

783 datastore_records: bool = False, 

784 timespan: Timespan | None = None, 

785 **kwargs: Any, 

786 ) -> DatasetRef: 

787 """Shared logic for methods that start with a search for a dataset in 

788 the registry. 

789 

790 Parameters 

791 ---------- 

792 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

793 When `DatasetRef` the `dataId` should be `None`. 

794 Otherwise the `DatasetType` or name thereof. 

795 dataId : `dict` or `DataCoordinate`, optional 

796 A `dict` of `Dimension` link name, value pairs that label the 

797 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

798 should be provided as the first argument. 

799 collections : Any, optional 

800 Collections to be searched, overriding ``self.collections``. 

801 Can be any of the types supported by the ``collections`` argument 

802 to butler construction. 

803 predict : `bool`, optional 

804 If `True`, return a newly created `DatasetRef` with a unique 

805 dataset ID if finding a reference in the `Registry` fails. 

806 Defaults to `False`. 

807 run : `str`, optional 

808 Run collection name to use for creating `DatasetRef` for predicted 

809 datasets. Only used if ``predict`` is `True`. 

810 datastore_records : `bool`, optional 

811 If `True` add datastore records to returned `DatasetRef`. 

812 timespan : `Timespan` or `None`, optional 

813 A timespan that the validity range of the dataset must overlap. 

814 If not provided and this is a calibration dataset type, an attempt 

815 will be made to find the timespan from any temporal coordinate 

816 in the data ID. 

817 **kwargs 

818 Additional keyword arguments used to augment or construct a 

819 `DataId`. See `DataId` parameters. 

820 

821 Returns 

822 ------- 

823 ref : `DatasetRef` 

824 A reference to the dataset identified by the given arguments. 

825 This can be the same dataset reference as given if it was 

826 resolved. 

827 

828 Raises 

829 ------ 

830 LookupError 

831 Raised if no matching dataset exists in the `Registry` (and 

832 ``predict`` is `False`). 

833 ValueError 

834 Raised if a resolved `DatasetRef` was passed as an input, but it 

835 differs from the one found in the registry. 

836 TypeError 

837 Raised if no collections were provided. 

838 """ 

839 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

840 if isinstance(datasetRefOrType, DatasetRef): 

841 if collections is not None: 

842 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

843 # May need to retrieve datastore records if requested. 

844 if datastore_records and datasetRefOrType._datastore_records is None: 

845 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

846 return datasetRefOrType 

847 

848 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

849 

850 if datasetType.isCalibration(): 

851 # Because this is a calibration dataset, first try to make a 

852 # standardize the data ID without restricting the dimensions to 

853 # those of the dataset type requested, because there may be extra 

854 # dimensions that provide temporal information for a validity-range 

855 # lookup. 

856 dataId = DataCoordinate.standardize( 

857 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

858 ) 

859 if timespan is None: 

860 if dataId.dimensions.temporal: 

861 dataId = self._registry.expandDataId(dataId) 

862 # Use the timespan from the data ID to constrain the 

863 # calibration lookup, but only if the caller has not 

864 # specified an explicit timespan. 

865 timespan = dataId.timespan 

866 else: 

867 # Try an arbitrary timespan. Downstream will fail if this 

868 # results in more than one matching dataset. 

869 timespan = Timespan(None, None) 

870 else: 

871 # Standardize the data ID to just the dimensions of the dataset 

872 # type instead of letting registry.findDataset do it, so we get the 

873 # result even if no dataset is found. 

874 dataId = DataCoordinate.standardize( 

875 dataId, 

876 dimensions=datasetType.dimensions, 

877 defaults=self._registry.defaults.dataId, 

878 **kwargs, 

879 ) 

880 # Always lookup the DatasetRef, even if one is given, to ensure it is 

881 # present in the current collection. 

882 ref = self.find_dataset( 

883 datasetType, 

884 dataId, 

885 collections=collections, 

886 timespan=timespan, 

887 datastore_records=datastore_records, 

888 ) 

889 if ref is None: 

890 if predict: 

891 if run is None: 

892 run = self.run 

893 if run is None: 

894 raise TypeError("Cannot predict dataset ID/location with run=None.") 

895 return DatasetRef(datasetType, dataId, run=run) 

896 else: 

897 if collections is None: 

898 collections = self._registry.defaults.collections 

899 raise DatasetNotFoundError( 

900 f"Dataset {datasetType.name} with data ID {dataId} " 

901 f"could not be found in collections {collections}." 

902 ) 

903 if datasetType != ref.datasetType: 

904 # If they differ it is because the user explicitly specified 

905 # a compatible dataset type to this call rather than using the 

906 # registry definition. The DatasetRef must therefore be recreated 

907 # using the user definition such that the expected type is 

908 # returned. 

909 ref = DatasetRef( 

910 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

911 ) 

912 

913 return ref 

914 

915 @transactional 

916 def put( 

917 self, 

918 obj: Any, 

919 datasetRefOrType: DatasetRef | DatasetType | str, 

920 /, 

921 dataId: DataId | None = None, 

922 *, 

923 run: str | None = None, 

924 **kwargs: Any, 

925 ) -> DatasetRef: 

926 """Store and register a dataset. 

927 

928 Parameters 

929 ---------- 

930 obj : `object` 

931 The dataset. 

932 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

933 When `DatasetRef` is provided, ``dataId`` should be `None`. 

934 Otherwise the `DatasetType` or name thereof. If a fully resolved 

935 `DatasetRef` is given the run and ID are used directly. 

936 dataId : `dict` or `DataCoordinate` 

937 A `dict` of `Dimension` link name, value pairs that label the 

938 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

939 should be provided as the second argument. 

940 run : `str`, optional 

941 The name of the run the dataset should be added to, overriding 

942 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

943 **kwargs 

944 Additional keyword arguments used to augment or construct a 

945 `DataCoordinate`. See `DataCoordinate.standardize` 

946 parameters. Not used if a resolved `DatasetRef` is provided. 

947 

948 Returns 

949 ------- 

950 ref : `DatasetRef` 

951 A reference to the stored dataset, updated with the correct id if 

952 given. 

953 

954 Raises 

955 ------ 

956 TypeError 

957 Raised if the butler is read-only or if no run has been provided. 

958 """ 

959 if isinstance(datasetRefOrType, DatasetRef): 

960 # This is a direct put of predefined DatasetRef. 

961 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

962 if run is not None: 

963 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

964 # If registry already has a dataset with the same dataset ID, 

965 # dataset type and DataId, then _importDatasets will do nothing and 

966 # just return the original ref. We have to raise in this case; the 

967 # datastore check below handles that. 

968 self._registry._importDatasets([datasetRefOrType], expand=True) 

969 # Before trying to write to the datastore check that it does not 

970 # know this dataset. This is prone to races, of course. 

971 if self._datastore.knows(datasetRefOrType): 

972 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

973 # Try to write dataset to the datastore, if it fails due to a race 

974 # with another write, the content of stored data may be 

975 # unpredictable. 

976 try: 

977 self._datastore.put(obj, datasetRefOrType) 

978 except IntegrityError as e: 

979 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

980 return datasetRefOrType 

981 

982 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

983 if not self.isWriteable(): 

984 raise TypeError("Butler is read-only.") 

985 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

986 

987 # Handle dimension records in dataId 

988 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

989 

990 # Add Registry Dataset entry. 

991 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

992 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

993 self._datastore.put(obj, ref) 

994 

995 return ref 
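
# Usage note: a minimal ``put`` sketch. The run collection, dataset type and
# data ID below are illustrative assumptions and must already be registered:
#
#     butler = Butler.from_config("/repo", writeable=True)
#     ref = butler.put(
#         table,
#         "sourceTable",
#         instrument="HSC",
#         visit=1228,
#         run="u/example/scratch",
#     )
#     # ``ref`` is the resolved DatasetRef; passing a resolved ref back to
#     # put() instead takes the "direct put" branch above.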

996 

997 def getDeferred( 

998 self, 

999 datasetRefOrType: DatasetRef | DatasetType | str, 

1000 /, 

1001 dataId: DataId | None = None, 

1002 *, 

1003 parameters: dict | None = None, 

1004 collections: Any = None, 

1005 storageClass: str | StorageClass | None = None, 

1006 timespan: Timespan | None = None, 

1007 **kwargs: Any, 

1008 ) -> DeferredDatasetHandle: 

1009 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1010 after an immediate registry lookup. 

1011 

1012 Parameters 

1013 ---------- 

1014 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1015 When `DatasetRef` the `dataId` should be `None`. 

1016 Otherwise the `DatasetType` or name thereof. 

1017 dataId : `dict` or `DataCoordinate`, optional 

1018 A `dict` of `Dimension` link name, value pairs that label the 

1019 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1020 should be provided as the first argument. 

1021 parameters : `dict` 

1022 Additional StorageClass-defined options to control reading, 

1023 typically used to efficiently read only a subset of the dataset. 

1024 collections : Any, optional 

1025 Collections to be searched, overriding ``self.collections``. 

1026 Can be any of the types supported by the ``collections`` argument 

1027 to butler construction. 

1028 storageClass : `StorageClass` or `str`, optional 

1029 The storage class to be used to override the Python type 

1030 returned by this method. By default the returned type matches 

1031 the dataset type definition for this dataset. Specifying a 

1032 read `StorageClass` can force a different type to be returned. 

1033 This type must be compatible with the original type. 

1034 timespan : `Timespan` or `None`, optional 

1035 A timespan that the validity range of the dataset must overlap. 

1036 If not provided and this is a calibration dataset type, an attempt 

1037 will be made to find the timespan from any temporal coordinate 

1038 in the data ID. 

1039 **kwargs 

1040 Additional keyword arguments used to augment or construct a 

1041 `DataId`. See `DataId` parameters. 

1042 

1043 Returns 

1044 ------- 

1045 obj : `DeferredDatasetHandle` 

1046 A handle which can be used to retrieve a dataset at a later time. 

1047 

1048 Raises 

1049 ------ 

1050 LookupError 

1051 Raised if no matching dataset exists in the `Registry` or 

1052 datastore. 

1053 ValueError 

1054 Raised if a resolved `DatasetRef` was passed as an input, but it 

1055 differs from the one found in the registry. 

1056 TypeError 

1057 Raised if no collections were provided. 

1058 """ 

1059 if isinstance(datasetRefOrType, DatasetRef): 

1060 # Do the quick check first and if that fails, check for artifact 

1061 # existence. This is necessary for datastores that are configured 

1062 # in trust mode where there won't be a record but there will be 

1063 # a file. 

1064 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1065 ref = datasetRefOrType 

1066 else: 

1067 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1068 else: 

1069 ref = self._findDatasetRef( 

1070 datasetRefOrType, dataId, collections=collections, timespan=timespan, **kwargs 

1071 ) 

1072 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 
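
# Usage note: the handle defers the datastore read until ``get()`` is called
# on it. A sketch; the "deepCoadd" dataset type, data ID values, and the
# "bbox" parameter are illustrative assumptions (valid parameters depend on
# the storage class):
#
#     handle = butler.getDeferred("deepCoadd", tract=9813, patch=42, band="i")
#     # ...later, once the subset to read is known:
#     cutout = handle.get(parameters={"bbox": bbox})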

1073 

1074 def get( 

1075 self, 

1076 datasetRefOrType: DatasetRef | DatasetType | str, 

1077 /, 

1078 dataId: DataId | None = None, 

1079 *, 

1080 parameters: dict[str, Any] | None = None, 

1081 collections: Any = None, 

1082 storageClass: StorageClass | str | None = None, 

1083 timespan: Timespan | None = None, 

1084 **kwargs: Any, 

1085 ) -> Any: 

1086 """Retrieve a stored dataset. 

1087 

1088 Parameters 

1089 ---------- 

1090 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1091 When `DatasetRef` the `dataId` should be `None`. 

1092 Otherwise the `DatasetType` or name thereof. 

1093 If a resolved `DatasetRef`, the associated dataset 

1094 is returned directly without additional querying. 

1095 dataId : `dict` or `DataCoordinate` 

1096 A `dict` of `Dimension` link name, value pairs that label the 

1097 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1098 should be provided as the first argument. 

1099 parameters : `dict` 

1100 Additional StorageClass-defined options to control reading, 

1101 typically used to efficiently read only a subset of the dataset. 

1102 collections : Any, optional 

1103 Collections to be searched, overriding ``self.collections``. 

1104 Can be any of the types supported by the ``collections`` argument 

1105 to butler construction. 

1106 storageClass : `StorageClass` or `str`, optional 

1107 The storage class to be used to override the Python type 

1108 returned by this method. By default the returned type matches 

1109 the dataset type definition for this dataset. Specifying a 

1110 read `StorageClass` can force a different type to be returned. 

1111 This type must be compatible with the original type. 

1112 timespan : `Timespan` or `None`, optional 

1113 A timespan that the validity range of the dataset must overlap. 

1114 If not provided and this is a calibration dataset type, an attempt 

1115 will be made to find the timespan from any temporal coordinate 

1116 in the data ID. 

1117 **kwargs 

1118 Additional keyword arguments used to augment or construct a 

1119 `DataCoordinate`. See `DataCoordinate.standardize` 

1120 parameters. 

1121 

1122 Returns 

1123 ------- 

1124 obj : `object` 

1125 The dataset. 

1126 

1127 Raises 

1128 ------ 

1129 LookupError 

1130 Raised if no matching dataset exists in the `Registry`. 

1131 TypeError 

1132 Raised if no collections were provided. 

1133 

1134 Notes 

1135 ----- 

1136 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1137 this method requires that the given data ID include temporal dimensions 

1138 beyond the dimensions of the dataset type itself, in order to find the 

1139 dataset with the appropriate validity range. For example, a "bias" 

1140 dataset with native dimensions ``{instrument, detector}`` could be 

1141 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1142 ``exposure`` is a temporal dimension. 

1143 """ 

1144 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1145 ref = self._findDatasetRef( 

1146 datasetRefOrType, 

1147 dataId, 

1148 collections=collections, 

1149 datastore_records=True, 

1150 timespan=timespan, 

1151 **kwargs, 

1152 ) 

1153 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 
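
# Usage note: the calibration lookup described in the Notes above. A sketch;
# the instrument, detector, exposure and collection values are illustrative
# assumptions:
#
#     # "bias" has dimensions {instrument, detector}; the extra "exposure"
#     # value provides the temporal coordinate used to select the matching
#     # validity range in a CALIBRATION collection.
#     bias = butler.get(
#         "bias",
#         instrument="HSC",
#         detector=10,
#         exposure=903334,
#         collections="HSC/calib",
#     )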

1154 

1155 def getURIs( 

1156 self, 

1157 datasetRefOrType: DatasetRef | DatasetType | str, 

1158 /, 

1159 dataId: DataId | None = None, 

1160 *, 

1161 predict: bool = False, 

1162 collections: Any = None, 

1163 run: str | None = None, 

1164 **kwargs: Any, 

1165 ) -> DatasetRefURIs: 

1166 """Return the URIs associated with the dataset. 

1167 

1168 Parameters 

1169 ---------- 

1170 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1171 When `DatasetRef` the `dataId` should be `None`. 

1172 Otherwise the `DatasetType` or name thereof. 

1173 dataId : `dict` or `DataCoordinate` 

1174 A `dict` of `Dimension` link name, value pairs that label the 

1175 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1176 should be provided as the first argument. 

1177 predict : `bool` 

1178 If `True`, allow URIs to be returned of datasets that have not 

1179 been written. 

1180 collections : Any, optional 

1181 Collections to be searched, overriding ``self.collections``. 

1182 Can be any of the types supported by the ``collections`` argument 

1183 to butler construction. 

1184 run : `str`, optional 

1185 Run to use for predictions, overriding ``self.run``. 

1186 **kwargs 

1187 Additional keyword arguments used to augment or construct a 

1188 `DataCoordinate`. See `DataCoordinate.standardize` 

1189 parameters. 

1190 

1191 Returns 

1192 ------- 

1193 uris : `DatasetRefURIs` 

1194 The URI to the primary artifact associated with this dataset (if 

1195 the dataset was disassembled within the datastore this may be 

1196 `None`), and the URIs to any components associated with the dataset 

1197 artifact (can be empty if there are no components). 

1198 """ 

1199 ref = self._findDatasetRef( 

1200 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1201 ) 

1202 return self._datastore.getURIs(ref, predict) 

1203 

1204 def get_dataset_type(self, name: str) -> DatasetType: 

1205 return self._registry.getDatasetType(name) 

1206 

1207 def get_dataset( 

1208 self, 

1209 id: DatasetId, 

1210 *, 

1211 storage_class: str | StorageClass | None = None, 

1212 dimension_records: bool = False, 

1213 datastore_records: bool = False, 

1214 ) -> DatasetRef | None: 

1215 ref = self._registry.getDataset(id) 

1216 if ref is not None: 

1217 if dimension_records: 

1218 ref = ref.expanded( 

1219 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1220 ) 

1221 if storage_class: 

1222 ref = ref.overrideStorageClass(storage_class) 

1223 if datastore_records: 

1224 ref = self._registry.get_datastore_records(ref) 

1225 return ref 

1226 

1227 def find_dataset( 

1228 self, 

1229 dataset_type: DatasetType | str, 

1230 data_id: DataId | None = None, 

1231 *, 

1232 collections: str | Sequence[str] | None = None, 

1233 timespan: Timespan | None = None, 

1234 storage_class: str | StorageClass | None = None, 

1235 dimension_records: bool = False, 

1236 datastore_records: bool = False, 

1237 **kwargs: Any, 

1238 ) -> DatasetRef | None: 

1239 # Handle any parts of the dataID that are not using primary dimension 

1240 # keys. 

1241 if isinstance(dataset_type, str): 

1242 actual_type = self.get_dataset_type(dataset_type) 

1243 else: 

1244 actual_type = dataset_type 

1245 

1246 # Store the component for later. 

1247 component_name = actual_type.component() 

1248 if actual_type.isComponent(): 

1249 parent_type = actual_type.makeCompositeDatasetType() 

1250 else: 

1251 parent_type = actual_type 

1252 

1253 data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs) 

1254 

1255 ref = self._registry.findDataset( 

1256 parent_type, 

1257 data_id, 

1258 collections=collections, 

1259 timespan=timespan, 

1260 datastore_records=datastore_records, 

1261 **kwargs, 

1262 ) 

1263 if ref is not None and dimension_records: 

1264 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1265 if ref is not None and component_name: 

1266 ref = ref.makeComponentRef(component_name) 

1267 if ref is not None and storage_class is not None: 

1268 ref = ref.overrideStorageClass(storage_class) 

1269 

1270 return ref 
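
# Usage note: ``find_dataset`` resolves a dataset type plus data ID into a
# DatasetRef without reading any artifact. A sketch; the "flat" dataset type,
# data ID values, collection name, and timespan bounds are illustrative
# assumptions:
#
#     ref = butler.find_dataset(
#         "flat",
#         instrument="HSC",
#         detector=10,
#         physical_filter="HSC-I",
#         collections="HSC/calib",
#         timespan=Timespan(begin, end),
#         dimension_records=True,
#     )
#     if ref is not None:
#         print(ref.run, ref.id)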

1271 

1272 def retrieveArtifacts( 

1273 self, 

1274 refs: Iterable[DatasetRef], 

1275 destination: ResourcePathExpression, 

1276 transfer: str = "auto", 

1277 preserve_path: bool = True, 

1278 overwrite: bool = False, 

1279 ) -> list[ResourcePath]: 

1280 # Docstring inherited. 

1281 return self._datastore.retrieveArtifacts( 

1282 refs, 

1283 ResourcePath(destination), 

1284 transfer=transfer, 

1285 preserve_path=preserve_path, 

1286 overwrite=overwrite, 

1287 ) 

1288 

1289 def exists( 

1290 self, 

1291 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1292 /, 

1293 data_id: DataId | None = None, 

1294 *, 

1295 full_check: bool = True, 

1296 collections: Any = None, 

1297 **kwargs: Any, 

1298 ) -> DatasetExistence: 

1299 # Docstring inherited. 

1300 existence = DatasetExistence.UNRECOGNIZED 

1301 

1302 if isinstance(dataset_ref_or_type, DatasetRef): 

1303 if collections is not None: 

1304 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1305 if data_id is not None: 

1306 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1307 ref = dataset_ref_or_type 

1308 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1309 if registry_ref is not None: 

1310 existence |= DatasetExistence.RECORDED 

1311 

1312 if dataset_ref_or_type != registry_ref: 

1313 # This could mean that storage classes differ, so we should 

1314 # check for that but use the registry ref for the rest of 

1315 # the method. 

1316 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1317 # Use the registry version from now on. 

1318 ref = registry_ref 

1319 else: 

1320 raise ValueError( 

1321 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1322 f"in registry but has different incompatible values ({registry_ref})." 

1323 ) 

1324 else: 

1325 try: 

1326 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1327 except (LookupError, TypeError): 

1328 return existence 

1329 existence |= DatasetExistence.RECORDED 

1330 

1331 if self._datastore.knows(ref): 

1332 existence |= DatasetExistence.DATASTORE 

1333 

1334 if full_check: 

1335 if self._datastore.exists(ref): 

1336 existence |= DatasetExistence._ARTIFACT 

1337 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1338 # Do not add this flag if we have no other idea about a dataset. 

1339 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1340 

1341 return existence 
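
# Usage note: ``DatasetExistence`` is a flag enum, so the return value of
# ``exists()`` can be tested bitwise. A sketch with illustrative arguments:
#
#     existence = butler.exists(
#         "raw", instrument="HSC", detector=10, exposure=903334, collections="HSC/raw/all"
#     )
#     if existence & DatasetExistence.RECORDED:
#         print("known to the registry")
#     if existence & DatasetExistence.DATASTORE:
#         print("known to the datastore")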

1342 

1343 def _exists_many( 

1344 self, 

1345 refs: Iterable[DatasetRef], 

1346 /, 

1347 *, 

1348 full_check: bool = True, 

1349 ) -> dict[DatasetRef, DatasetExistence]: 

1350 # Docstring inherited. 

1351 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1352 

1353 # Registry does not have a bulk API to check for a ref. 

1354 for ref in refs: 

1355 registry_ref = self._registry.getDataset(ref.id) 

1356 if registry_ref is not None: 

1357 # It is possible, albeit unlikely, that the given ref does 

1358 # not match the one in registry even though the UUID matches. 

1359 # When checking a single ref we raise, but it's impolite to 

1360 # do that when potentially hundreds of refs are being checked. 

1361 # We could change the API to only accept UUIDs and that would 

1362 # remove the ability to even check and remove the worry 

1363 # about differing storage classes. Given the ongoing discussion 

1364 # on refs vs UUIDs and whether to raise or have a new 

1365 # private flag, treat this as a private API for now. 

1366 existence[ref] |= DatasetExistence.RECORDED 

1367 

1368 # Ask datastore if it knows about these refs. 

1369 knows = self._datastore.knows_these(refs) 

1370 for ref, known in knows.items(): 

1371 if known: 

1372 existence[ref] |= DatasetExistence.DATASTORE 

1373 

1374 if full_check: 

1375 mexists = self._datastore.mexists(refs) 

1376 for ref, exists in mexists.items(): 

1377 if exists: 

1378 existence[ref] |= DatasetExistence._ARTIFACT 

1379 else: 

1380 # Do not set this flag if nothing is known about the dataset. 

1381 for ref in existence: 

1382 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1383 existence[ref] |= DatasetExistence._ASSUMED 

1384 

1385 return existence 

1386 

1387 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1388 # Docstring inherited. 

1389 if not self.isWriteable(): 

1390 raise TypeError("Butler is read-only.") 

1391 names = list(names) 

1392 refs: list[DatasetRef] = [] 

1393 for name in names: 

1394 collectionType = self._registry.getCollectionType(name) 

1395 if collectionType is not CollectionType.RUN: 

1396 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1397 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1398 with self._datastore.transaction(), self._registry.transaction(): 

1399 if unstore: 

1400 self._datastore.trash(refs) 

1401 else: 

1402 self._datastore.forget(refs) 

1403 for name in names: 

1404 self._registry.removeCollection(name) 

1405 if unstore: 

1406 # Point of no return for removing artifacts 

1407 self._datastore.emptyTrash() 

1408 

1409 def pruneDatasets( 

1410 self, 

1411 refs: Iterable[DatasetRef], 

1412 *, 

1413 disassociate: bool = True, 

1414 unstore: bool = False, 

1415 tags: Iterable[str] = (), 

1416 purge: bool = False, 

1417 ) -> None: 

1418 # docstring inherited from LimitedButler 

1419 

1420 if not self.isWriteable(): 

1421 raise TypeError("Butler is read-only.") 

1422 if purge: 

1423 if not disassociate: 

1424 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1425 if not unstore: 

1426 raise TypeError("Cannot pass purge=True without unstore=True.") 

1427 elif disassociate: 

1428 tags = tuple(tags) 

1429 if not tags: 

1430 raise TypeError("No tags provided but disassociate=True.") 

1431 for tag in tags: 

1432 collectionType = self._registry.getCollectionType(tag) 

1433 if collectionType is not CollectionType.TAGGED: 

1434 raise TypeError( 

1435 f"Cannot disassociate from collection '{tag}' " 

1436 f"of non-TAGGED type {collectionType.name}." 

1437 ) 

1438 # Transform possibly-single-pass iterable into something we can iterate 

1439 # over multiple times. 

1440 refs = list(refs) 

1441 # Pruning a component of a DatasetRef makes no sense since registry 

1442 # doesn't know about components and datastore might not store 

1443 # components in a separate file 

1444 for ref in refs: 

1445 if ref.datasetType.component(): 

1446 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1447 # We don't need an unreliable Datastore transaction for this, because 

1448 # we've been extra careful to ensure that Datastore.trash only involves 

1449 # mutating the Registry (it can _look_ at Datastore-specific things, 

1450 # but shouldn't change them), and hence all operations here are 

1451 # Registry operations. 

1452 with self._datastore.transaction(), self._registry.transaction(): 

1453 if unstore: 

1454 self._datastore.trash(refs) 

1455 if purge: 

1456 self._registry.removeDatasets(refs) 

1457 elif disassociate: 

1458 assert tags, "Guaranteed by earlier logic in this function." 

1459 for tag in tags: 

1460 self._registry.disassociate(tag, refs) 

1461 # We've exited the Registry transaction, and apparently committed. 

1462 # (if there was an exception, everything rolled back, and it's as if 

1463 # nothing happened - and we never get here). 

1464 # Datastore artifacts are not yet gone, but they're clearly marked 

1465 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1466 # problems we can try again later, and if manual administrative 

1467 # intervention is required, it's pretty clear what that should entail: 

1468 # deleting everything on disk and in private Datastore tables that is 

1469 # in the dataset_location_trash table. 

1470 if unstore: 

1471 # Point of no return for removing artifacts 

1472 self._datastore.emptyTrash() 

1473 
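# Two call patterns implied by the validation logic in pruneDatasets() above;
# the repository path, dataset type, and collection names are placeholders.
from lsst.daf.butler import Butler

butler = Butler.from_config("/repo/main", writeable=True)  # placeholder path
refs = list(butler.registry.queryDatasets("calexp", collections="u/someone/scratch-run"))

# Fully remove the datasets: requires disassociate=True and unstore=True.
butler.pruneDatasets(refs, purge=True, disassociate=True, unstore=True)

# Alternatively, only remove them from a TAGGED collection, leaving the
# datasets themselves (and their artifacts) in place:
# butler.pruneDatasets(refs, disassociate=True, tags=["u/someone/tagged"])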

1474 @transactional 

1475 def ingest( 

1476 self, 

1477 *datasets: FileDataset, 

1478 transfer: str | None = "auto", 

1479 record_validation_info: bool = True, 

1480 ) -> None: 

1481 # Docstring inherited. 

1482 if not self.isWriteable(): 

1483 raise TypeError("Butler is read-only.") 

1484 

1485 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1486 if not datasets: 

1487 return 

1488 

1489 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1490 

1491 # We need to reorganize all the inputs so that they are grouped 

1492 # by dataset type and run. Multiple refs in a single FileDataset 

1493 # are required to share the run and dataset type. 

1494 groupedData: MutableMapping[tuple[DatasetType, str], list[FileDataset]] = defaultdict(list) 

1495 

1496 # Track DataIDs that are being ingested so we can spot issues early 

1497 # with duplication. Retain previous FileDataset so we can report it. 

1498 groupedDataIds: MutableMapping[tuple[DatasetType, str], dict[DataCoordinate, FileDataset]] = ( 

1499 defaultdict(dict) 

1500 ) 

1501 

1502 # And the nested loop that populates it: 

1503 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1504 # Somewhere to store pre-existing refs if we have an 

1505 # execution butler. 

1506 existingRefs: list[DatasetRef] = [] 

1507 

1508 for ref in dataset.refs: 

1509 group_key = (ref.datasetType, ref.run) 

1510 

1511 if ref.dataId in groupedDataIds[group_key]: 

1512 raise ConflictingDefinitionError( 

1513                        f"Ingest conflict. Dataset {dataset.path} has the same" 

1514                        " DataId as other ingest dataset" 

1515                        f" {groupedDataIds[group_key][ref.dataId].path}" 

1516 f" ({ref.dataId})" 

1517 ) 

1518 

1519 groupedDataIds[group_key][ref.dataId] = dataset 

1520 

1521 if existingRefs: 

1522 if len(dataset.refs) != len(existingRefs): 

1523 # Keeping track of partially pre-existing datasets is hard 

1524 # and should generally never happen. For now don't allow 

1525 # it. 

1526 raise ConflictingDefinitionError( 

1527 f"For dataset {dataset.path} some dataIds already exist" 

1528 " in registry but others do not. This is not supported." 

1529 ) 

1530 

1531 # Store expanded form in the original FileDataset. 

1532 dataset.refs = existingRefs 

1533 else: 

1534 groupedData[group_key].append(dataset) 

1535 

1536 # Now we can bulk-insert into Registry for each DatasetType. 

1537 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1538 groupedData.items(), desc="Bulk-inserting datasets by type" 

1539 ): 

1540 refs_to_import = [] 

1541 for dataset in grouped_datasets: 

1542 refs_to_import.extend(dataset.refs) 

1543 

1544 n_refs = len(refs_to_import) 

1545 _LOG.verbose( 

1546 "Importing %d ref%s of dataset type %r into run %r", 

1547 n_refs, 

1548 "" if n_refs == 1 else "s", 

1549 datasetType.name, 

1550 this_run, 

1551 ) 

1552 

1553 # Import the refs and expand the DataCoordinates since we can't 

1554 # guarantee that they are expanded and Datastore will need 

1555 # the records. 

1556 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1557 assert set(imported_refs) == set(refs_to_import) 

1558 

1559 # Replace all the refs in the FileDataset with expanded versions. 

1560 # Pull them off in the order we put them on the list. 

1561 for dataset in grouped_datasets: 

1562 n_dataset_refs = len(dataset.refs) 

1563 dataset.refs = imported_refs[:n_dataset_refs] 

1564 del imported_refs[:n_dataset_refs] 

1565 

1566 # Bulk-insert everything into Datastore. 

1567 # We do not know if any of the registry entries already existed 

1568 # (_importDatasets only complains if they exist but differ) so 

1569 # we have to catch IntegrityError explicitly. 

1570 try: 

1571 self._datastore.ingest( 

1572 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1573 ) 

1574 except IntegrityError as e: 

1575 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1576 
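# A hedged sketch of calling ingest() above with a single FileDataset; the
# repository path, dataset type, data ID values, run name, and file path are
# all placeholders, and the dataset type is assumed to be registered already.
from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, FileDataset

butler = Butler.from_config("/repo/main", writeable=True)  # placeholder path
dataset_type = butler.get_dataset_type("raw")
data_id = DataCoordinate.standardize(
    {"instrument": "MyCam", "exposure": 42, "detector": 1},
    dimensions=dataset_type.dimensions,
)
ref = DatasetRef(dataset_type, data_id, run="MyCam/raw/all")
butler.ingest(
    FileDataset(path="/data/staging/exposure_42_det_1.fits", refs=[ref]),
    transfer="copy",
)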

1577 @contextlib.contextmanager 

1578 def export( 

1579 self, 

1580 *, 

1581 directory: str | None = None, 

1582 filename: str | None = None, 

1583 format: str | None = None, 

1584 transfer: str | None = None, 

1585 ) -> Iterator[RepoExportContext]: 

1586 # Docstring inherited. 

1587 if directory is None and transfer is not None: 

1588 raise TypeError("Cannot transfer without providing a directory.") 

1589 if transfer == "move": 

1590 raise TypeError("Transfer may not be 'move': export is read-only") 

1591 if format is None: 

1592 if filename is None: 

1593 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1594 else: 

1595 _, format = os.path.splitext(filename) 

1596 if not format: 

1597 raise ValueError("Please specify a file extension to determine export format.") 

1598                format = format[1:]  # Strip leading "." 

1599 elif filename is None: 

1600 filename = f"export.{format}" 

1601 if directory is not None: 

1602 filename = os.path.join(directory, filename) 

1603 formats = self._config["repo_transfer_formats"] 

1604 if format not in formats: 

1605 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1606 BackendClass = get_class_of(formats[format, "export"]) 

1607 with open(filename, "w") as stream: 

1608 backend = BackendClass(stream, universe=self.dimensions) 

1609 try: 

1610 helper = RepoExportContext( 

1611 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1612 ) 

1613 with self._caching_context(): 

1614 yield helper 

1615 except BaseException: 

1616 raise 

1617 else: 

1618 helper._finish() 

1619 
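# A hedged usage sketch for the export() context manager above: the backing
# file is only finalized via helper._finish() when the block exits cleanly.
# Paths and collection names are placeholders; saveDatasets/saveCollection are
# assumed RepoExportContext methods.
from lsst.daf.butler import Butler

butler = Butler.from_config("/repo/main")  # placeholder path
with butler.export(directory="/data/backup", filename="export.yaml", transfer="copy") as export:
    export.saveDatasets(butler.registry.queryDatasets("calexp", collections="some/run"))
    export.saveCollection("some/run")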

1620 def import_( 

1621 self, 

1622 *, 

1623 directory: ResourcePathExpression | None = None, 

1624 filename: ResourcePathExpression | TextIO | None = None, 

1625 format: str | None = None, 

1626 transfer: str | None = None, 

1627 skip_dimensions: set | None = None, 

1628 ) -> None: 

1629 # Docstring inherited. 

1630 if not self.isWriteable(): 

1631 raise TypeError("Butler is read-only.") 

1632 if format is None: 

1633 if filename is None: 

1634 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1635 else: 

1636 _, format = os.path.splitext(filename) # type: ignore 

1637 elif filename is None: 

1638 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1639 if directory is not None: 

1640 directory = ResourcePath(directory, forceDirectory=True) 

1641 # mypy doesn't think this will work but it does in python >= 3.10. 

1642 if isinstance(filename, ResourcePathExpression): # type: ignore 

1643 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1644 if not filename.isabs() and directory is not None: 

1645 potential = directory.join(filename) 

1646 exists_in_cwd = filename.exists() 

1647 exists_in_dir = potential.exists() 

1648 if exists_in_cwd and exists_in_dir: 

1649 _LOG.warning( 

1650 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1651 "Additionally, the file exists relative to the given search directory (%s). " 

1652 "Using the export file in the given directory.", 

1653 filename, 

1654 potential, 

1655 ) 

1656 # Given they specified an explicit directory and that 

1657 # directory has the export file in it, assume that that 

1658 # is what was meant despite the file in cwd. 

1659 filename = potential 

1660 elif exists_in_dir: 

1661 filename = potential 

1662 elif not exists_in_cwd and not exists_in_dir: 

1663 # Raise early. 

1664 raise FileNotFoundError( 

1665 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1666 ) 

1667 BackendClass: type[RepoImportBackend] = get_class_of( 

1668 self._config["repo_transfer_formats"][format]["import"] 

1669 ) 

1670 

1671 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1672 with self._caching_context(): 

1673 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1674 backend.register() 

1675 with self.transaction(): 

1676 backend.load( 

1677 self._datastore, 

1678 directory=directory, 

1679 transfer=transfer, 

1680 skip_dimensions=skip_dimensions, 

1681 ) 

1682 

1683 if isinstance(filename, ResourcePath): 

1684            # We cannot use open() here at the moment because of 

1685 # DM-38589 since yaml does stream.read(8192) in a loop. 

1686 stream = io.StringIO(filename.read().decode()) 

1687 doImport(stream) 

1688 else: 

1689 doImport(filename) # type: ignore 

1690 
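# A hedged usage sketch for import_() above, the read side of export(); the
# export format is inferred from the filename extension when not given.
# Paths are placeholders.
from lsst.daf.butler import Butler

butler = Butler.from_config("/repo/main", writeable=True)  # placeholder path
butler.import_(directory="/data/backup", filename="export.yaml", transfer="copy")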

1691 def transfer_dimension_records_from( 

1692 self, source_butler: LimitedButler | Butler, source_refs: Iterable[DatasetRef] 

1693 ) -> None: 

1694 # Allowed dimensions in the target butler. 

1695 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1696 

1697 data_ids = {ref.dataId for ref in source_refs} 

1698 

1699 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1700 source_butler, data_ids, elements 

1701 ) 

1702 

1703 # Insert order is important. 

1704 for element in self.dimensions.sorted(dimension_records.keys()): 

1705            records = list(dimension_records[element].values()) 

1706            # Assume that if the record is already present we can 

1707 # use it without having to check that the record metadata 

1708 # is consistent. 

1709 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1710 _LOG.debug("Dimension '%s' -- number of records transferred: %d", element.name, len(records)) 

1711 
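# A hedged sketch of transfer_dimension_records_from() above: copy the
# dimension records needed by a set of refs from another repository.  The
# repository paths, dataset type, and collection name are placeholders.
from lsst.daf.butler import Butler

source = Butler.from_config("/repo/source")                  # placeholder path
target = Butler.from_config("/repo/target", writeable=True)  # placeholder path
refs = source.registry.queryDatasets("raw", collections="MyCam/raw/all")
target.transfer_dimension_records_from(source, refs)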

1712 def _extract_all_dimension_records_from_data_ids( 

1713 self, 

1714 source_butler: LimitedButler | Butler, 

1715 data_ids: set[DataCoordinate], 

1716 allowed_elements: frozenset[DimensionElement], 

1717 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1718 primary_records = self._extract_dimension_records_from_data_ids( 

1719 source_butler, data_ids, allowed_elements 

1720 ) 

1721 

1722        can_query = isinstance(source_butler, Butler) 

1723 

1724 additional_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1725 for original_element, record_mapping in primary_records.items(): 

1726 # Get dimensions that depend on this dimension. 

1727 populated_by = self.dimensions.get_elements_populated_by( 

1728 self.dimensions[original_element.name] # type: ignore 

1729 ) 

1730 

1731 for data_id in record_mapping.keys(): 

1732 for element in populated_by: 

1733 if element not in allowed_elements: 

1734 continue 

1735 if element.name == original_element.name: 

1736 continue 

1737 

1738 if element.name in primary_records: 

1739                    # If this element has already been stored, avoid 

1740 # re-finding records since that may lead to additional 

1741 # spurious records. e.g. visit is populated_by 

1742 # visit_detector_region but querying 

1743 # visit_detector_region by visit will return all the 

1744 # detectors for this visit -- the visit dataId does not 

1745 # constrain this. 

1746 # To constrain the query the original dataIds would 

1747 # have to be scanned. 

1748 continue 

1749 

1750 if not can_query: 

1751 raise RuntimeError( 

1752 f"Transferring populated_by records like {element.name} requires a full Butler." 

1753 ) 

1754 

1755 records = source_butler.registry.queryDimensionRecords( # type: ignore 

1756 element.name, 

1757 **data_id.mapping, # type: ignore 

1758 ) 

1759 for record in records: 

1760 additional_records[record.definition].setdefault(record.dataId, record) 

1761 

1762 # The next step is to walk back through the additional records to 

1763 # pick up any missing content (such as visit_definition needing to 

1764        # know the exposure). We want to ensure we do not request records we 

1765 # already have. 

1766 missing_data_ids = set() 

1767 for name, record_mapping in additional_records.items(): 

1768 for data_id in record_mapping.keys(): 

1769 if data_id not in primary_records[name]: 

1770 missing_data_ids.add(data_id) 

1771 

1772 # Fill out the new records. Assume that these new records do not 

1773 # also need to carry over additional populated_by records. 

1774 secondary_records = self._extract_dimension_records_from_data_ids( 

1775 source_butler, missing_data_ids, allowed_elements 

1776 ) 

1777 

1778 # Merge the extra sets of records in with the original. 

1779 for name, record_mapping in itertools.chain(additional_records.items(), secondary_records.items()): 

1780 primary_records[name].update(record_mapping) 

1781 

1782 return primary_records 

1783 

1784 def _extract_dimension_records_from_data_ids( 

1785 self, 

1786 source_butler: LimitedButler | Butler, 

1787 data_ids: set[DataCoordinate], 

1788 allowed_elements: frozenset[DimensionElement], 

1789 ) -> dict[DimensionElement, dict[DataCoordinate, DimensionRecord]]: 

1790 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1791 

1792 for data_id in data_ids: 

1793            # Need an expanded record; if it is not expanded we need a full 

1794 # butler with registry (allow mocks with registry too). 

1795 if not data_id.hasRecords(): 

1796 if registry := getattr(source_butler, "registry", None): 

1797 data_id = registry.expandDataId(data_id) 

1798 else: 

1799 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1800 # If this butler doesn't know about a dimension in the source 

1801            # butler, things will break later. 

1802 for element_name in data_id.dimensions.elements: 

1803 record = data_id.records[element_name] 

1804 if record is not None and record.definition in allowed_elements: 

1805 dimension_records[record.definition].setdefault(record.dataId, record) 

1806 

1807 return dimension_records 

1808 

1809 def transfer_from( 

1810 self, 

1811 source_butler: LimitedButler, 

1812 source_refs: Iterable[DatasetRef], 

1813 transfer: str = "auto", 

1814 skip_missing: bool = True, 

1815 register_dataset_types: bool = False, 

1816 transfer_dimensions: bool = False, 

1817 dry_run: bool = False, 

1818 ) -> collections.abc.Collection[DatasetRef]: 

1819 # Docstring inherited. 

1820 if not self.isWriteable(): 

1821 raise TypeError("Butler is read-only.") 

1822 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1823 

1824 # Will iterate through the refs multiple times so need to convert 

1825 # to a list if this isn't a collection. 

1826 if not isinstance(source_refs, collections.abc.Collection): 

1827 source_refs = list(source_refs) 

1828 

1829 original_count = len(source_refs) 

1830 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1831 

1832 # In some situations the datastore artifact may be missing 

1833 # and we do not want that registry entry to be imported. 

1834        # Asking datastore is not sufficient: the records may have been 

1835        # purged, so we have to ask for the (predicted) URI and check 

1836 # existence explicitly. Execution butler is set up exactly like 

1837 # this with no datastore records. 

1838 artifact_existence: dict[ResourcePath, bool] = {} 

1839 if skip_missing: 

1840 dataset_existence = source_butler._datastore.mexists( 

1841 source_refs, artifact_existence=artifact_existence 

1842 ) 

1843 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1844 filtered_count = len(source_refs) 

1845 n_missing = original_count - filtered_count 

1846 _LOG.verbose( 

1847 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1848 n_missing, 

1849 "" if n_missing == 1 else "s", 

1850 filtered_count, 

1851 ) 

1852 

1853 # Importing requires that we group the refs by dataset type and run 

1854 # before doing the import. 

1855 source_dataset_types = set() 

1856 grouped_refs = defaultdict(list) 

1857 for ref in source_refs: 

1858 grouped_refs[ref.datasetType, ref.run].append(ref) 

1859 source_dataset_types.add(ref.datasetType) 

1860 

1861 # Check to see if the dataset type in the source butler has 

1862 # the same definition in the target butler and register missing 

1863 # ones if requested. Registration must happen outside a transaction. 

1864 newly_registered_dataset_types = set() 

1865 for datasetType in source_dataset_types: 

1866 if register_dataset_types: 

1867 # Let this raise immediately if inconsistent. Continuing 

1868 # on to find additional inconsistent dataset types 

1869 # might result in additional unwanted dataset types being 

1870 # registered. 

1871 if self._registry.registerDatasetType(datasetType): 

1872 newly_registered_dataset_types.add(datasetType) 

1873 else: 

1874 # If the dataset type is missing, let it fail immediately. 

1875 target_dataset_type = self.get_dataset_type(datasetType.name) 

1876 if target_dataset_type != datasetType: 

1877 raise ConflictingDefinitionError( 

1878 "Source butler dataset type differs from definition" 

1879 f" in target butler: {datasetType} !=" 

1880 f" {target_dataset_type}" 

1881 ) 

1882 if newly_registered_dataset_types: 

1883 # We may have registered some even if there were inconsistencies 

1884 # but should let people know (or else remove them again). 

1885 _LOG.verbose( 

1886 "Registered the following dataset types in the target Butler: %s", 

1887 ", ".join(d.name for d in newly_registered_dataset_types), 

1888 ) 

1889 else: 

1890 _LOG.verbose("All required dataset types are known to the target Butler") 

1891 

1892 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1893 if transfer_dimensions: 

1894 # Collect all the dimension records for these refs. 

1895 # All dimensions are to be copied but the list of valid dimensions 

1896            # comes from this butler's universe. 

1897 elements = frozenset(element for element in self.dimensions.elements if element.has_own_table) 

1898 dataIds = {ref.dataId for ref in source_refs} 

1899 dimension_records = self._extract_all_dimension_records_from_data_ids( 

1900 source_butler, dataIds, elements 

1901 ) 

1902 

1903 handled_collections: set[str] = set() 

1904 

1905 # Do all the importing in a single transaction. 

1906 with self.transaction(): 

1907 if dimension_records and not dry_run: 

1908 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1909 # Order matters. 

1910 for element in self.dimensions.sorted(dimension_records.keys()): 

1911                    records = list(dimension_records[element].values()) 

1912                    # Assume that if the record is already present we can 

1913 # use it without having to check that the record metadata 

1914 # is consistent. 

1915 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1916 

1917 n_imported = 0 

1918 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1919 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1920 ): 

1921 if run not in handled_collections: 

1922 # May need to create output collection. If source butler 

1923 # has a registry, ask for documentation string. 

1924 run_doc = None 

1925 if registry := getattr(source_butler, "registry", None): 

1926 run_doc = registry.getCollectionDocumentation(run) 

1927 if not dry_run: 

1928 registered = self._registry.registerRun(run, doc=run_doc) 

1929 else: 

1930 registered = True 

1931 handled_collections.add(run) 

1932 if registered: 

1933 _LOG.verbose("Creating output run %s", run) 

1934 

1935 n_refs = len(refs_to_import) 

1936 _LOG.verbose( 

1937 "Importing %d ref%s of dataset type %s into run %s", 

1938 n_refs, 

1939 "" if n_refs == 1 else "s", 

1940 datasetType.name, 

1941 run, 

1942 ) 

1943 

1944 # Assume we are using UUIDs and the source refs will match 

1945 # those imported. 

1946 if not dry_run: 

1947 imported_refs = self._registry._importDatasets(refs_to_import) 

1948 else: 

1949 imported_refs = refs_to_import 

1950 assert set(imported_refs) == set(refs_to_import) 

1951 n_imported += len(imported_refs) 

1952 

1953 assert len(source_refs) == n_imported 

1954 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

1955 

1956 # Ask the datastore to transfer. The datastore has to check that 

1957 # the source datastore is compatible with the target datastore. 

1958 accepted, rejected = self._datastore.transfer_from( 

1959 source_butler._datastore, 

1960 source_refs, 

1961 transfer=transfer, 

1962 artifact_existence=artifact_existence, 

1963 dry_run=dry_run, 

1964 ) 

1965 if rejected: 

1966 # For now, accept the registry entries but not the files. 

1967 _LOG.warning( 

1968 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

1969 len(rejected), 

1970 len(accepted), 

1971 datasetType, 

1972 run, 

1973 ) 

1974 

1975 return source_refs 

1976 
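# A hedged end-to-end sketch for transfer_from() above; repository paths,
# dataset type, and collection names are placeholders.
from lsst.daf.butler import Butler

source = Butler.from_config("/repo/source")                  # placeholder path
target = Butler.from_config("/repo/target", writeable=True)  # placeholder path
refs = source.registry.queryDatasets("calexp", collections="some/run", findFirst=True)
transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    register_dataset_types=True,
    transfer_dimensions=True,
)
print(f"Transferred {len(transferred)} datasets")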

1977 def validateConfiguration( 

1978 self, 

1979 logFailures: bool = False, 

1980 datasetTypeNames: Iterable[str] | None = None, 

1981 ignore: Iterable[str] | None = None, 

1982 ) -> None: 

1983 # Docstring inherited. 

1984 if datasetTypeNames: 

1985 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

1986 else: 

1987 datasetTypes = list(self._registry.queryDatasetTypes()) 

1988 

1989 # filter out anything from the ignore list 

1990 if ignore: 

1991 ignore = set(ignore) 

1992 datasetTypes = [ 

1993 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

1994 ] 

1995 else: 

1996 ignore = set() 

1997 

1998 # For each datasetType that has an instrument dimension, create 

1999 # a DatasetRef for each defined instrument 

2000 datasetRefs = [] 

2001 

2002 # Find all the registered instruments (if "instrument" is in the 

2003 # universe). 

2004 if "instrument" in self.dimensions: 

2005 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2006 

2007 for datasetType in datasetTypes: 

2008 if "instrument" in datasetType.dimensions: 

2009 # In order to create a conforming dataset ref, create 

2010 # fake DataCoordinate values for the non-instrument 

2011 # dimensions. The type of the value does not matter here. 

2012 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

2013 

2014 for instrument in instruments: 

2015 datasetRef = DatasetRef( 

2016 datasetType, 

2017 DataCoordinate.standardize( 

2018 dataId, instrument=instrument, dimensions=datasetType.dimensions 

2019 ), 

2020 run="validate", 

2021 ) 

2022 datasetRefs.append(datasetRef) 

2023 

2024 entities: list[DatasetType | DatasetRef] = [] 

2025 entities.extend(datasetTypes) 

2026 entities.extend(datasetRefs) 

2027 

2028 datastoreErrorStr = None 

2029 try: 

2030 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2031 except ValidationError as e: 

2032 datastoreErrorStr = str(e) 

2033 

2034 # Also check that the LookupKeys used by the datastores match 

2035 # registry and storage class definitions 

2036 keys = self._datastore.getLookupKeys() 

2037 

2038 failedNames = set() 

2039 failedDataId = set() 

2040 for key in keys: 

2041 if key.name is not None: 

2042 if key.name in ignore: 

2043 continue 

2044 

2045 # skip if specific datasetType names were requested and this 

2046 # name does not match 

2047 if datasetTypeNames and key.name not in datasetTypeNames: 

2048 continue 

2049 

2050 # See if it is a StorageClass or a DatasetType 

2051 if key.name in self.storageClasses: 

2052 pass 

2053 else: 

2054 try: 

2055 self.get_dataset_type(key.name) 

2056 except KeyError: 

2057 if logFailures: 

2058 _LOG.critical( 

2059 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2060 ) 

2061 failedNames.add(key) 

2062 else: 

2063 # Dimensions are checked for consistency when the Butler 

2064 # is created and rendezvoused with a universe. 

2065 pass 

2066 

2067            # Check that the instrument named in the key is a valid instrument. 

2068            # Currently only the instrument dimension is supported, so check for that. 

2069 if key.dataId: 

2070 dataIdKeys = set(key.dataId) 

2071 if {"instrument"} != dataIdKeys: 

2072 if logFailures: 

2073 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2074 failedDataId.add(key) 

2075 elif key.dataId["instrument"] not in instruments: 

2076 if logFailures: 

2077 _LOG.critical("Key '%s' has unknown instrument", key) 

2078 failedDataId.add(key) 

2079 

2080 messages = [] 

2081 

2082 if datastoreErrorStr: 

2083 messages.append(datastoreErrorStr) 

2084 

2085 for failed, msg in ( 

2086 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2087 (failedDataId, "Keys with bad DataId entries: "), 

2088 ): 

2089 if failed: 

2090 msg += ", ".join(str(k) for k in failed) 

2091 messages.append(msg) 

2092 

2093 if messages: 

2094 raise ValidationError(";\n".join(messages)) 

2095 
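# A minimal sketch of calling validateConfiguration() above; it raises a
# single ValidationError aggregating the datastore and lookup-key problems.
# The repository path and ignored dataset type name are placeholders.
from lsst.daf.butler import Butler, ValidationError

butler = Butler.from_config("/repo/main")  # placeholder path
try:
    butler.validateConfiguration(logFailures=True, ignore=["raw"])
except ValidationError as err:
    print(f"Repository configuration problems:\n{err}")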

2096 @property 

2097 def collections(self) -> Sequence[str]: 

2098 """The collections to search by default, in order 

2099 (`~collections.abc.Sequence` [ `str` ]). 

2100 

2101 This is an alias for ``self.registry.defaults.collections``. It cannot 

2102 be set directly in isolation, but all defaults may be changed together 

2103 by assigning a new `RegistryDefaults` instance to 

2104 ``self.registry.defaults``. 

2105 """ 

2106 return self._registry.defaults.collections 

2107 

2108 @property 

2109 def run(self) -> str | None: 

2110 """Name of the run this butler writes outputs to by default (`str` or 

2111 `None`). 

2112 

2113 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2114 directly in isolation, but all defaults may be changed together by 

2115 assigning a new `RegistryDefaults` instance to 

2116 ``self.registry.defaults``. 

2117 """ 

2118 return self._registry.defaults.run 

2119 
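# As the two property docstrings above note, the default collections and run
# cannot be assigned individually; a hedged sketch of changing them together,
# with placeholder repository path and collection/run names.
from lsst.daf.butler import Butler
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler.from_config("/repo/main", writeable=True)  # placeholder path
butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"], run="u/someone/run")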

2120 @property 

2121 def registry(self) -> Registry: 

2122 """The object that manages dataset metadata and relationships 

2123 (`Registry`). 

2124 

2125 Many operations that don't involve reading or writing butler datasets 

2126 are accessible only via `Registry` methods. Eventually these methods 

2127 will be replaced by equivalent `Butler` methods. 

2128 """ 

2129 return self._registry_shim 

2130 

2131 @property 

2132 def dimensions(self) -> DimensionUniverse: 

2133 # Docstring inherited. 

2134 return self._registry.dimensions 

2135 

2136 @contextlib.contextmanager 

2137 def _query(self) -> Iterator[Query]: 

2138 # Docstring inherited. 

2139 raise NotImplementedError("TODO DM-41159") 

2140 

2141 def _preload_cache(self) -> None: 

2142 """Immediately load caches that are used for common operations.""" 

2143 self._registry.preload_cache() 

2144 

2145 def prepend_collection_chain( 

2146 self, parent_collection_name: str, child_collection_names: str | Iterable[str] 

2147 ) -> None: 

2148 return self._registry._managers.collections.prepend_collection_chain( 

2149 parent_collection_name, list(ensure_iterable(child_collection_names)) 

2150 ) 

2151 
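# A minimal sketch for prepend_collection_chain() above: the named children
# are placed at the front of an existing CHAINED collection's search order.
# The repository path and collection names are placeholders.
from lsst.daf.butler import Butler

butler = Butler.from_config("/repo/main", writeable=True)  # placeholder path
butler.prepend_collection_chain("HSC/defaults", ["u/someone/new-calibs"])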

2152 _config: ButlerConfig 

2153 """Configuration for this Butler instance.""" 

2154 

2155 _registry: SqlRegistry 

2156 """The object that manages dataset metadata and relationships 

2157 (`SqlRegistry`). 

2158 

2159 Most operations that don't involve reading or writing butler datasets are 

2160 accessible only via `SqlRegistry` methods. 

2161 """ 

2162 

2163 datastore: Datastore 

2164 """The object that manages actual dataset storage (`Datastore`). 

2165 

2166 Direct user access to the datastore should rarely be necessary; the primary 

2167 exception is the case where a `Datastore` implementation provides extra 

2168 functionality beyond what the base class defines. 

2169 """ 

2170 

2171 storageClasses: StorageClassFactory 

2172 """An object that maps known storage class names to objects that fully 

2173 describe them (`StorageClassFactory`). 

2174 """ 

2175 

2176 _registry_shim: RegistryShim 

2177 """Shim object to provide a legacy public interface for querying via the 

2178    ``registry`` property. 

2179 """