Coverage for python/lsst/daf/butler/direct_butler.py: 11%

715 statements  

coverage.py v7.3.2, created at 2023-12-01 11:00 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import logging 

41import numbers 

42import os 

43import warnings 

44from collections import Counter, defaultdict 

45from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

46from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

47 

48from deprecated.sphinx import deprecated 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.logging import VERBOSE, getLogger 

52from sqlalchemy.exc import IntegrityError 

53 

54from ._butler import Butler 

55from ._butler_config import ButlerConfig 

56from ._config import Config 

57from ._dataset_existence import DatasetExistence 

58from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef 

59from ._dataset_type import DatasetType 

60from ._deferredDatasetHandle import DeferredDatasetHandle 

61from ._exceptions import ValidationError 

62from ._file_dataset import FileDataset 

63from ._limited_butler import LimitedButler 

64from ._registry_shim import RegistryShim 

65from ._storage_class import StorageClass, StorageClassFactory 

66from ._timespan import Timespan 

67from .datastore import DatasetRefURIs, Datastore, NullDatastore 

68from .dimensions import ( 

69 DataCoordinate, 

70 DataId, 

71 DataIdValue, 

72 Dimension, 

73 DimensionElement, 

74 DimensionRecord, 

75 DimensionUniverse, 

76) 

77from .progress import Progress 

78from .registry import ( 

79 CollectionType, 

80 ConflictingDefinitionError, 

81 DataIdError, 

82 MissingDatasetTypeError, 

83 NoDefaultCollectionError, 

84 Registry, 

85 RegistryDefaults, 

86 _RegistryFactory, 

87) 

88from .registry.sql_registry import SqlRegistry 

89from .transfers import RepoExportContext 

90from .utils import transactional 

91 

92if TYPE_CHECKING: 

93 from lsst.resources import ResourceHandleProtocol 

94 

95 from .transfers import RepoImportBackend 

96 

97_LOG = getLogger(__name__) 

98 

99 

100class ButlerValidationError(ValidationError): 

101 """There is a problem with the Butler configuration.""" 

102 

103 pass 

104 

105 

106class DirectButler(Butler): 

107 """Main entry point for the data access system. 

108 

109 Parameters 

110 ---------- 

111 config : `ButlerConfig`, `Config` or `str`, optional 

112 Configuration. Anything acceptable to the 

113 `ButlerConfig` constructor. If a directory path 

114 is given the configuration will be read from a ``butler.yaml`` file in 

115 that location. If `None` is given default values will be used. 

116 butler : `DirectButler`, optional 

117 If provided, construct a new Butler that uses the same registry and 

118 datastore as the given one, but with the given collection and run. 

119 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

120 arguments. 

121 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

122 An expression specifying the collections to be searched (in order) when 

123 reading datasets. 

124 This may be a `str` collection name or an iterable thereof. 

125 See :ref:`daf_butler_collection_expressions` for more information. 

126 These collections are not registered automatically and must be 

127 manually registered before they are used by any method, but they may be 

128 manually registered after the `Butler` is initialized. 

129 run : `str`, optional 

130 Name of the `~CollectionType.RUN` collection new datasets should be 

131 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

132 ``collections`` will be set to ``[run]``. If not `None`, this 

133 collection will automatically be registered. If this is not set (and 

134 ``writeable`` is not set either), a read-only butler will be created. 

135 searchPaths : `list` of `str`, optional 

136 Directory paths to search when calculating the full Butler 

137 configuration. Not used if the supplied config is already a 

138 `ButlerConfig`. 

139 writeable : `bool`, optional 

140 Explicitly sets whether the butler supports write operations. If not 

141 provided, a read-write butler is created if any of ``run``, ``tags``, 

142 or ``chains`` is non-empty. 

143 inferDefaults : `bool`, optional 

144 If `True` (default) infer default data ID values from the values 

145 present in the datasets in ``collections``: if all collections have the 

146 same value (or no value) for a governor dimension, that value will be 

147 the default for that dimension. Nonexistent collections are ignored. 

148 If a default value is provided explicitly for a governor dimension via 

149 ``**kwargs``, no default will be inferred for that dimension. 

150 without_datastore : `bool`, optional 

151 If `True` do not attach a datastore to this butler. Any attempts 

152 to use a datastore will fail. 

153 **kwargs : `str` 

154 Default data ID key-value pairs. These may only identify "governor" 

155 dimensions like ``instrument`` and ``skymap``. 
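
Examples
--------
A minimal construction sketch based on the parameters above; the
repository path and the collection and run names are hypothetical::

    butler = DirectButler(
        "/path/to/repo",
        collections=["HSC/defaults"],
        run="u/someone/processing",
    )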

156 """ 

157 

158 def __init__( 

159 self, 

160 config: Config | ResourcePathExpression | None = None, 

161 *, 

162 butler: DirectButler | None = None, 

163 collections: Any = None, 

164 run: str | None = None, 

165 searchPaths: Sequence[ResourcePathExpression] | None = None, 

166 writeable: bool | None = None, 

167 inferDefaults: bool = True, 

168 without_datastore: bool = False, 

169 **kwargs: str, 

170 ): 

171 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

172 # Load registry, datastore, etc. from config or existing butler. 

173 if butler is not None: 

174 if config is not None or searchPaths is not None or writeable is not None: 

175 raise TypeError( 

176 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

177 ) 

178 self._registry = butler._registry.copy(defaults) 

179 self._datastore = butler._datastore 

180 self.storageClasses = butler.storageClasses 

181 self._config: ButlerConfig = butler._config 

182 else: 

183 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) 

184 try: 

185 butlerRoot = self._config.get("root", self._config.configDir) 

186 if writeable is None: 

187 writeable = run is not None 

188 self._registry = _RegistryFactory(self._config).from_config( 

189 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

190 ) 

191 if without_datastore: 

192 self._datastore = NullDatastore(None, None) 

193 else: 

194 self._datastore = Datastore.fromConfig( 

195 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

196 ) 

197 # TODO: Once datastore drops dependency on registry we can 

198 # construct datastore first and pass opaque tables to registry 

199 # constructor. 

200 self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions()) 

201 self.storageClasses = StorageClassFactory() 

202 self.storageClasses.addFromConfig(self._config) 

203 except Exception: 

204 # Failures here usually mean that configuration is incomplete, 

205 # just issue an error message which includes config file URI. 

206 _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

207 raise 

208 

209 # For an execution butler the datastore needs a special 

210 # dependency-inversion trick. This is not used by a regular butler, 

211 # but we do not have a way to distinguish a regular butler from an 

212 # execution butler. 

213 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

214 

215 if "run" in self._config or "collection" in self._config: 

216 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

217 

218 self._registry_shim = RegistryShim(self) 

219 

220 GENERATION: ClassVar[int] = 3 

221 """This is a Generation 3 Butler. 

222 

223 This attribute may be removed in the future, once the Generation 2 Butler 

224 interface has been fully retired; it should only be used in transitional 

225 code. 

226 """ 

227 

228 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

229 """Return DatasetType defined in registry given dataset type name.""" 

230 try: 

231 return self.get_dataset_type(name) 

232 except MissingDatasetTypeError: 

233 return None 

234 

235 @classmethod 

236 def _unpickle( 

237 cls, 

238 config: ButlerConfig, 

239 collections: tuple[str, ...] | None, 

240 run: str | None, 

241 defaultDataId: dict[str, str], 

242 writeable: bool, 

243 ) -> DirectButler: 

244 """Callable used to unpickle a Butler. 

245 

246 We prefer not to use ``Butler.__init__`` directly so we can force some 

247 of its many arguments to be keyword-only (note that ``__reduce__`` 

248 can only invoke callables with positional arguments). 

249 

250 Parameters 

251 ---------- 

252 config : `ButlerConfig` 

253 Butler configuration, already coerced into a true `ButlerConfig` 

254 instance (and hence after any search paths for overrides have been 

255 utilized). 

256 collections : `tuple` [ `str` ] 

257 Names of the default collections to read from. 

258 run : `str`, optional 

259 Name of the default `~CollectionType.RUN` collection to write to. 

260 defaultDataId : `dict` [ `str`, `str` ] 

261 Default data ID values. 

262 writeable : `bool` 

263 Whether the Butler should support write operations. 

264 

265 Returns 

266 ------- 

267 butler : `Butler` 

268 A new `Butler` instance. 

269 """ 

270 # MyPy doesn't recognize that the kwargs below are totally valid; it 

271 # seems to think ``**defaultDataId`` is a _positional_ argument! 

272 return cls( 

273 config=config, 

274 collections=collections, 

275 run=run, 

276 writeable=writeable, 

277 **defaultDataId, # type: ignore 

278 ) 

279 

280 def __reduce__(self) -> tuple: 

281 """Support pickling.""" 

282 return ( 

283 DirectButler._unpickle, 

284 ( 

285 self._config, 

286 self.collections, 

287 self.run, 

288 dict(self._registry.defaults.dataId.required), 

289 self._registry.isWriteable(), 

290 ), 

291 ) 

292 

293 def __str__(self) -> str: 

294 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

295 self.collections, self.run, self._datastore, self._registry 

296 ) 

297 

298 def isWriteable(self) -> bool: 

299 # Docstring inherited. 

300 return self._registry.isWriteable() 

301 

302 def _caching_context(self) -> contextlib.AbstractContextManager[None]: 

303 """Context manager that enables caching.""" 

304 return self._registry.caching_context() 

305 

306 @contextlib.contextmanager 

307 def transaction(self) -> Iterator[None]: 

308 """Context manager supporting `Butler` transactions. 

309 

310 Transactions can be nested. 
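
Examples
--------
A sketch of grouping two writes so that neither is kept if either
fails; ``catalog``, ``metadata``, ``data_id``, and the dataset type
names are placeholders::

    with butler.transaction():
        butler.put(catalog, "srcCatalog", data_id)
        butler.put(metadata, "srcCatalog_metadata", data_id)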

311 """ 

312 with self._registry.transaction(), self._datastore.transaction(): 

313 yield 

314 

315 def _standardizeArgs( 

316 self, 

317 datasetRefOrType: DatasetRef | DatasetType | str, 

318 dataId: DataId | None = None, 

319 for_put: bool = True, 

320 **kwargs: Any, 

321 ) -> tuple[DatasetType, DataId | None]: 

322 """Standardize the arguments passed to several Butler APIs. 

323 

324 Parameters 

325 ---------- 

326 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

327 When `DatasetRef` the `dataId` should be `None`. 

328 Otherwise the `DatasetType` or name thereof. 

329 dataId : `dict` or `DataCoordinate` 

330 A `dict` of `Dimension` link name, value pairs that label the 

331 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

332 should be provided as the first argument. 

333 for_put : `bool`, optional 

334 If `True` this call is invoked as part of a `Butler.put()`. 

335 Otherwise it is assumed to be part of a `Butler.get()`. This 

336 parameter is only relevant if there is dataset type 

337 inconsistency. 

338 **kwargs 

339 Additional keyword arguments used to augment or construct a 

340 `DataCoordinate`. See `DataCoordinate.standardize` 

341 parameters. 

342 

343 Returns 

344 ------- 

345 datasetType : `DatasetType` 

346 A `DatasetType` instance extracted from ``datasetRefOrType``. 

347 dataId : `dict` or `DataId`, optional 

348 Argument that can be used (along with ``kwargs``) to construct a 

349 `DataId`. 

350 

351 Notes 

352 ----- 

353 Butler APIs that conceptually need a DatasetRef also allow passing a 

354 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

355 keyword arguments that can be used to construct one) separately. This 

356 method accepts those arguments and always returns a true `DatasetType` 

357 and a `DataId` or `dict`. 

358 

359 Standardization of `dict` vs `DataId` is best handled by passing the 

360 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

361 generally similarly flexible. 
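
Examples
--------
Both of these hypothetical calls reduce to the same
``(DatasetType, DataId)`` pair internally::

    butler.get("raw", {"instrument": "MyCam", "exposure": 101, "detector": 5})
    butler.get("raw", instrument="MyCam", exposure=101, detector=5)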

362 """ 

363 externalDatasetType: DatasetType | None = None 

364 internalDatasetType: DatasetType | None = None 

365 if isinstance(datasetRefOrType, DatasetRef): 

366 if dataId is not None or kwargs: 

367 raise ValueError("DatasetRef given, cannot use dataId as well") 

368 externalDatasetType = datasetRefOrType.datasetType 

369 dataId = datasetRefOrType.dataId 

370 else: 

371 # Don't check whether DataId is provided, because Registry APIs 

372 # can usually construct a better error message when it wasn't. 

373 if isinstance(datasetRefOrType, DatasetType): 

374 externalDatasetType = datasetRefOrType 

375 else: 

376 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

377 

378 # Check that they are self-consistent 

379 if externalDatasetType is not None: 

380 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

381 if externalDatasetType != internalDatasetType: 

382 # We can allow differences if they are compatible, depending 

383 # on whether this is a get or a put. A get requires that 

384 # the python type associated with the datastore can be 

385 # converted to the user type. A put requires that the user 

386 # supplied python type can be converted to the internal 

387 # type expected by registry. 

388 relevantDatasetType = internalDatasetType 

389 if for_put: 

390 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

391 else: 

392 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

393 relevantDatasetType = externalDatasetType 

394 if not is_compatible: 

395 raise ValueError( 

396 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

397 f"registry definition ({internalDatasetType})" 

398 ) 

399 # Override the internal definition. 

400 internalDatasetType = relevantDatasetType 

401 

402 assert internalDatasetType is not None 

403 return internalDatasetType, dataId 

404 

405 def _rewrite_data_id( 

406 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

407 ) -> tuple[DataId | None, dict[str, Any]]: 

408 """Rewrite a data ID taking into account dimension records. 

409 

410 Take a Data ID and keyword args and rewrite it if necessary to 

411 allow the user to specify dimension records rather than dimension 

412 primary values. 

413 

414 This allows a user to include a dataId dict with keys of 

415 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

416 the integer exposure ID. It also allows a string to be given 

417 for a dimension value rather than the integer ID if that is more 

418 convenient. For example, rather than having to specify the 

419 detector with ``detector.full_name``, a string given for ``detector`` 

420 will be interpreted as the full name and converted to the integer 

421 value. 

422 

423 Keyword arguments can also use strings for dimensions like detector 

424 and exposure, but Python does not allow them to include ``.`` and 

425 so the ``exposure.day_obs`` syntax cannot be used in a keyword 

426 argument. 

427 

428 Parameters 

429 ---------- 

430 dataId : `dict` or `DataCoordinate` 

431 A `dict` of `Dimension` link name, value pairs that will label the 

432 `DatasetRef` within a Collection. 

433 datasetType : `DatasetType` 

434 The dataset type associated with this dataId. Required to 

435 determine the relevant dimensions. 

436 **kwargs 

437 Additional keyword arguments used to augment or construct a 

438 `DataId`. See `DataId` parameters. 

439 

440 Returns 

441 ------- 

442 dataId : `dict` or `DataCoordinate` 

443 The possibly rewritten dataId. If given a `DataCoordinate` and 

444 no keyword arguments, the original dataId will be returned 

445 unchanged. 

446 **kwargs : `dict` 

447 Any unused keyword arguments (normally an empty dict). 
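
Examples
--------
An illustrative rewrite; the instrument and the record values are
hypothetical. A data ID given as::

    {"instrument": "MyCam", "exposure.day_obs": 20231201, "exposure.seq_num": 45}

would be rewritten so that ``exposure`` carries the integer exposure ID
found by querying the exposure dimension records with those two values.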

448 """ 

449 # Do nothing if we have a standalone DataCoordinate. 

450 if isinstance(dataId, DataCoordinate) and not kwargs: 

451 return dataId, kwargs 

452 

453 # Process dimension records that are using record information 

454 # rather than ids 

455 newDataId: dict[str, DataIdValue] = {} 

456 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

457 

458 # If the dataId comes entirely from keyword parameters we do not need 

459 # to do anything here, because the keys cannot be of the form 

460 # exposure.obs_id: a "." is not allowed in a keyword parameter. 

461 if dataId: 

462 for k, v in dataId.items(): 

463 # If we have a Dimension we do not need to do anything 

464 # because it cannot be a compound key. 

465 if isinstance(k, str) and "." in k: 

466 # Someone is using a more human-readable dataId 

467 dimensionName, record = k.split(".", 1) 

468 byRecord[dimensionName][record] = v 

469 elif isinstance(k, Dimension): 

470 newDataId[k.name] = v 

471 else: 

472 newDataId[k] = v 

473 

474 # Go through the updated dataId and check the type in case someone is 

475 # using an alternate key. We have already filtered out the compound 

476 # dimension.record keys. 

477 not_dimensions = {} 

478 

479 # We need to look in both the dataId and the keyword arguments 

480 # and remove any entries that need to be fixed or are unrecognized. 

481 for dataIdDict in (newDataId, kwargs): 

482 # Use a list so we can adjust the dict safely in the loop 

483 for dimensionName in list(dataIdDict): 

484 value = dataIdDict[dimensionName] 

485 try: 

486 dimension = self.dimensions.dimensions[dimensionName] 

487 except KeyError: 

488 # This is not a real dimension 

489 not_dimensions[dimensionName] = value 

490 del dataIdDict[dimensionName] 

491 continue 

492 

493 # Convert an integral type to an explicit int to simplify 

494 # comparisons here 

495 if isinstance(value, numbers.Integral): 

496 value = int(value) 

497 

498 if not isinstance(value, dimension.primaryKey.getPythonType()): 

499 for alternate in dimension.alternateKeys: 

500 if isinstance(value, alternate.getPythonType()): 

501 byRecord[dimensionName][alternate.name] = value 

502 del dataIdDict[dimensionName] 

503 _LOG.debug( 

504 "Converting dimension %s to %s.%s=%s", 

505 dimensionName, 

506 dimensionName, 

507 alternate.name, 

508 value, 

509 ) 

510 break 

511 else: 

512 _LOG.warning( 

513 "Type mismatch found for value '%r' provided for dimension %s. " 

514 "Could not find matching alternative (primary key has type %s) " 

515 "so attempting to use as-is.", 

516 value, 

517 dimensionName, 

518 dimension.primaryKey.getPythonType(), 

519 ) 

520 

521 # By this point kwargs and newDataId should only include valid 

522 # dimensions. Merge kwargs into the new dataId and log if there 

523 # are dimensions in both (rather than calling update). 

524 for k, v in kwargs.items(): 

525 if k in newDataId and newDataId[k] != v: 

526 _LOG.debug( 

527 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

528 ) 

529 newDataId[k] = v 

530 # No need to retain any values in kwargs now. 

531 kwargs = {} 

532 

533 # If we have some unrecognized dimensions we have to try to connect 

534 # them to records in other dimensions. This is made more complicated 

535 # by some dimensions having records with clashing names. A mitigation 

536 # is that we can tell by this point which dimensions are missing 

537 # for the DatasetType but this does not work for calibrations 

538 # where additional dimensions can be used to constrain the temporal 

539 # axis. 

540 if not_dimensions: 

541 # Search for all dimensions even if we have been given a value 

542 # explicitly. In some cases records are given as well as the 

543 # actual dimension and this should not be an error if they 

544 # match. 

545 mandatoryDimensions = datasetType.dimensions.names # - provided 

546 

547 candidateDimensions: set[str] = set() 

548 candidateDimensions.update(mandatoryDimensions) 

549 

550 # For calibrations we may well need temporal dimensions, 

551 # so rather than always including all dimensions in the scan, 

552 # restrict things a little. It is still possible for there 

553 # to be confusion over day_obs in visit vs exposure for example. 

554 # If we are not searching calibration collections things may 

555 # fail but they are going to fail anyway because of the 

556 # ambiguity of the dataId... 

557 if datasetType.isCalibration(): 

558 for dim in self.dimensions.dimensions: 

559 if dim.temporal: 

560 candidateDimensions.add(str(dim)) 

561 

562 # Look up table for the first association with a dimension 

563 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

564 

565 # Keep track of whether an item is associated with multiple 

566 # dimensions. 

567 counter: Counter[str] = Counter() 

568 assigned: dict[str, set[str]] = defaultdict(set) 

569 

570 # Go through the missing dimensions and associate the 

571 # given names with records within those dimensions 

572 matched_dims = set() 

573 for dimensionName in candidateDimensions: 

574 dimension = self.dimensions.dimensions[dimensionName] 

575 fields = dimension.metadata.names | dimension.uniqueKeys.names 

576 for field in not_dimensions: 

577 if field in fields: 

578 guessedAssociation[dimensionName][field] = not_dimensions[field] 

579 counter[dimensionName] += 1 

580 assigned[field].add(dimensionName) 

581 matched_dims.add(field) 

582 

583 # Calculate the fields that matched nothing. 

584 never_found = set(not_dimensions) - matched_dims 

585 

586 if never_found: 

587 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

588 

589 # There is a chance we have allocated a single dataId item 

590 # to multiple dimensions. Need to decide which should be retained. 

591 # For now assume that the most popular alternative wins. 

592 # This means that day_obs with seq_num will result in 

593 # exposure.day_obs and not visit.day_obs 

594 # Also prefer an explicitly missing dimension over an inferred 

595 # temporal dimension. 

596 for fieldName, assignedDimensions in assigned.items(): 

597 if len(assignedDimensions) > 1: 

598 # Pick the most popular (preferring mandatory dimensions) 

599 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

600 if requiredButMissing: 

601 candidateDimensions = requiredButMissing 

602 else: 

603 candidateDimensions = assignedDimensions 

604 

605 # If this is a choice between visit and exposure and 

606 # neither was a required part of the dataset type, 

607 # (hence in this branch) always prefer exposure over 

608 # visit since exposures are always defined and visits 

609 # are defined from exposures. 

610 if candidateDimensions == {"exposure", "visit"}: 

611 candidateDimensions = {"exposure"} 

612 

613 # Select the relevant items and get a new restricted 

614 # counter. 

615 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

616 duplicatesCounter: Counter[str] = Counter() 

617 duplicatesCounter.update(theseCounts) 

618 

619 # Choose the most common. If they are equally common 

620 # we will pick the one that was found first. 

621 # Returns a list of tuples 

622 selected = duplicatesCounter.most_common(1)[0][0] 

623 

624 _LOG.debug( 

625 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

626 " Removed ambiguity by choosing dimension %s.", 

627 fieldName, 

628 ", ".join(assignedDimensions), 

629 selected, 

630 ) 

631 

632 for candidateDimension in assignedDimensions: 

633 if candidateDimension != selected: 

634 del guessedAssociation[candidateDimension][fieldName] 

635 

636 # Update the record look up dict with the new associations 

637 for dimensionName, values in guessedAssociation.items(): 

638 if values: # A dict might now be empty 

639 _LOG.debug( 

640 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

641 ) 

642 byRecord[dimensionName].update(values) 

643 

644 if byRecord: 

645 # Some record specifiers were found so we need to convert 

646 # them to the Id form 

647 for dimensionName, values in byRecord.items(): 

648 if dimensionName in newDataId: 

649 _LOG.debug( 

650 "DataId specified explicit %s dimension value of %s in addition to" 

651 " general record specifiers for it of %s. Ignoring record information.", 

652 dimensionName, 

653 newDataId[dimensionName], 

654 str(values), 

655 ) 

656 # Get the actual record and compare with these values. 

657 try: 

658 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

659 except DataIdError: 

660 raise ValueError( 

661 f"Could not find dimension '{dimensionName}'" 

662 f" with dataId {newDataId} as part of comparing with" 

663 f" record values {byRecord[dimensionName]}" 

664 ) from None 

665 if len(recs) == 1: 

666 errmsg: list[str] = [] 

667 for k, v in values.items(): 

668 if (recval := getattr(recs[0], k)) != v: 

669 errmsg.append(f"{k}({recval} != {v})") 

670 if errmsg: 

671 raise ValueError( 

672 f"Dimension {dimensionName} in dataId has explicit value" 

673 " inconsistent with records: " + ", ".join(errmsg) 

674 ) 

675 else: 

676 # Multiple matches for an explicit dimension 

677 # should never happen but let downstream complain. 

678 pass 

679 continue 

680 

681 # Build up a WHERE expression 

682 bind = dict(values.items()) 

683 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

684 

685 # Hopefully we get a single record that matches 

686 records = set( 

687 self._registry.queryDimensionRecords( 

688 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

689 ) 

690 ) 

691 

692 if len(records) != 1: 

693 if len(records) > 1: 

694 # visit can have an ambiguous answer without involving 

695 # visit_system. The default visit_system is defined 

696 # by the instrument. 

697 if ( 

698 dimensionName == "visit" 

699 and "visit_system_membership" in self.dimensions 

700 and "visit_system" in self.dimensions["instrument"].metadata 

701 ): 

702 instrument_records = list( 

703 self._registry.queryDimensionRecords( 

704 "instrument", 

705 dataId=newDataId, 

706 **kwargs, 

707 ) 

708 ) 

709 if len(instrument_records) == 1: 

710 visit_system = instrument_records[0].visit_system 

711 if visit_system is None: 

712 # Set to a value that will never match. 

713 visit_system = -1 

714 

715 # Look up each visit in the 

716 # visit_system_membership records. 

717 for rec in records: 

718 membership = list( 

719 self._registry.queryDimensionRecords( 

720 # Use bind to allow zero results. 

721 # This is a fully-specified query. 

722 "visit_system_membership", 

723 where="instrument = inst AND visit_system = system AND visit = v", 

724 bind=dict( 

725 inst=instrument_records[0].name, system=visit_system, v=rec.id 

726 ), 

727 ) 

728 ) 

729 if membership: 

730 # This record is the right answer. 

731 records = {rec} 

732 break 

733 

734 # The ambiguity may have been resolved so check again. 

735 if len(records) > 1: 

736 _LOG.debug( 

737 "Received %d records from constraints of %s", len(records), str(values) 

738 ) 

739 for r in records: 

740 _LOG.debug("- %s", str(r)) 

741 raise ValueError( 

742 f"DataId specification for dimension {dimensionName} is not" 

743 f" uniquely constrained to a single dataset by {values}." 

744 f" Got {len(records)} results." 

745 ) 

746 else: 

747 raise ValueError( 

748 f"DataId specification for dimension {dimensionName} matched no" 

749 f" records when constrained by {values}" 

750 ) 

751 

752 # Get the primary key from the real dimension object 

753 dimension = self.dimensions.dimensions[dimensionName] 

754 if not isinstance(dimension, Dimension): 

755 raise RuntimeError( 

756 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

757 ) 

758 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

759 

760 return newDataId, kwargs 

761 

762 def _findDatasetRef( 

763 self, 

764 datasetRefOrType: DatasetRef | DatasetType | str, 

765 dataId: DataId | None = None, 

766 *, 

767 collections: Any = None, 

768 predict: bool = False, 

769 run: str | None = None, 

770 datastore_records: bool = False, 

771 **kwargs: Any, 

772 ) -> DatasetRef: 

773 """Shared logic for methods that start with a search for a dataset in 

774 the registry. 

775 

776 Parameters 

777 ---------- 

778 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

779 When `DatasetRef` the `dataId` should be `None`. 

780 Otherwise the `DatasetType` or name thereof. 

781 dataId : `dict` or `DataCoordinate`, optional 

782 A `dict` of `Dimension` link name, value pairs that label the 

783 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

784 should be provided as the first argument. 

785 collections : Any, optional 

786 Collections to be searched, overriding ``self.collections``. 

787 Can be any of the types supported by the ``collections`` argument 

788 to butler construction. 

789 predict : `bool`, optional 

790 If `True`, return a newly created `DatasetRef` with a unique 

791 dataset ID if finding a reference in the `Registry` fails. 

792 Defaults to `False`. 

793 run : `str`, optional 

794 Run collection name to use for creating `DatasetRef` for predicted 

795 datasets. Only used if ``predict`` is `True`. 

796 datastore_records : `bool`, optional 

797 If `True` add datastore records to returned `DatasetRef`. 

798 **kwargs 

799 Additional keyword arguments used to augment or construct a 

800 `DataId`. See `DataId` parameters. 

801 

802 Returns 

803 ------- 

804 ref : `DatasetRef` 

805 A reference to the dataset identified by the given arguments. 

806 This can be the same dataset reference as given if it was 

807 resolved. 

808 

809 Raises 

810 ------ 

811 LookupError 

812 Raised if no matching dataset exists in the `Registry` (and 

813 ``predict`` is `False`). 

814 ValueError 

815 Raised if a resolved `DatasetRef` was passed as an input, but it 

816 differs from the one found in the registry. 

817 TypeError 

818 Raised if no collections were provided. 

819 """ 

820 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

821 if isinstance(datasetRefOrType, DatasetRef): 

822 if collections is not None: 

823 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

824 # May need to retrieve datastore records if requested. 

825 if datastore_records and datasetRefOrType._datastore_records is None: 

826 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

827 return datasetRefOrType 

828 timespan: Timespan | None = None 

829 

830 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

831 

832 if datasetType.isCalibration(): 

833 # Because this is a calibration dataset, first try to make a 

834 # standardize the data ID without restricting the dimensions to 

835 # those of the dataset type requested, because there may be extra 

836 # dimensions that provide temporal information for a validity-range 

837 # lookup. 

838 dataId = DataCoordinate.standardize( 

839 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

840 ) 

841 if dataId.dimensions.temporal: 

842 dataId = self._registry.expandDataId(dataId) 

843 timespan = dataId.timespan 

844 else: 

845 # Standardize the data ID to just the dimensions of the dataset 

846 # type instead of letting registry.findDataset do it, so we get the 

847 # result even if no dataset is found. 

848 dataId = DataCoordinate.standardize( 

849 dataId, 

850 dimensions=datasetType.dimensions, 

851 defaults=self._registry.defaults.dataId, 

852 **kwargs, 

853 ) 

854 # Always look up the DatasetRef, even if one is given, to ensure it is 

855 # present in the current collection. 

856 ref = self.find_dataset( 

857 datasetType, 

858 dataId, 

859 collections=collections, 

860 timespan=timespan, 

861 datastore_records=datastore_records, 

862 ) 

863 if ref is None: 

864 if predict: 

865 if run is None: 

866 run = self.run 

867 if run is None: 

868 raise TypeError("Cannot predict dataset ID/location with run=None.") 

869 return DatasetRef(datasetType, dataId, run=run) 

870 else: 

871 if collections is None: 

872 collections = self._registry.defaults.collections 

873 raise LookupError( 

874 f"Dataset {datasetType.name} with data ID {dataId} " 

875 f"could not be found in collections {collections}." 

876 ) 

877 if datasetType != ref.datasetType: 

878 # If they differ it is because the user explicitly specified 

879 # a compatible dataset type to this call rather than using the 

880 # registry definition. The DatasetRef must therefore be recreated 

881 # using the user definition such that the expected type is 

882 # returned. 

883 ref = DatasetRef( 

884 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

885 ) 

886 

887 return ref 

888 

889 # TODO: remove on DM-40067. 

890 @transactional 

891 @deprecated( 

892 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

893 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

894 " were relying on the run parameter to determine the run." 

895 " Will be removed after v26.0.", 

896 version="v26.0", 

897 category=FutureWarning, 

898 ) 

899 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

900 # Docstring inherited. 

901 return self.put(obj, ref) 

902 

903 @transactional 

904 def put( 

905 self, 

906 obj: Any, 

907 datasetRefOrType: DatasetRef | DatasetType | str, 

908 /, 

909 dataId: DataId | None = None, 

910 *, 

911 run: str | None = None, 

912 **kwargs: Any, 

913 ) -> DatasetRef: 

914 """Store and register a dataset. 

915 

916 Parameters 

917 ---------- 

918 obj : `object` 

919 The dataset. 

920 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

921 When `DatasetRef` is provided, ``dataId`` should be `None`. 

922 Otherwise the `DatasetType` or name thereof. If a fully resolved 

923 `DatasetRef` is given the run and ID are used directly. 

924 dataId : `dict` or `DataCoordinate` 

925 A `dict` of `Dimension` link name, value pairs that label the 

926 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

927 should be provided as the second argument. 

928 run : `str`, optional 

929 The name of the run the dataset should be added to, overriding 

930 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

931 **kwargs 

932 Additional keyword arguments used to augment or construct a 

933 `DataCoordinate`. See `DataCoordinate.standardize` 

934 parameters. Not used if a resolved `DatasetRef` is provided. 

935 

936 Returns 

937 ------- 

938 ref : `DatasetRef` 

939 A reference to the stored dataset, updated with the correct id if 

940 given. 

941 

942 Raises 

943 ------ 

944 TypeError 

945 Raised if the butler is read-only or if no run has been provided. 
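
Examples
--------
A minimal sketch; ``catalog``, the dataset type name, run, and data ID
values are hypothetical::

    ref = butler.put(
        catalog,
        "srcCatalog",
        instrument="MyCam",
        visit=12345,
        run="u/someone/processing",
    )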

946 """ 

947 if isinstance(datasetRefOrType, DatasetRef): 

948 # This is a direct put of predefined DatasetRef. 

949 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

950 if run is not None: 

951 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

952 # If registry already has a dataset with the same dataset ID, 

953 # dataset type and DataId, then _importDatasets will do nothing and 

954 # just return the original ref. We have to raise in this case; there 

955 # is a datastore check below for that. 

956 self._registry._importDatasets([datasetRefOrType], expand=True) 

957 # Before trying to write to the datastore check that it does not 

958 # know this dataset. This is prone to races, of course. 

959 if self._datastore.knows(datasetRefOrType): 

960 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

961 # Try to write the dataset to the datastore; if it fails due to a race 

962 # with another write, the content of stored data may be 

963 # unpredictable. 

964 try: 

965 self._datastore.put(obj, datasetRefOrType) 

966 except IntegrityError as e: 

967 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

968 return datasetRefOrType 

969 

970 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

971 if not self.isWriteable(): 

972 raise TypeError("Butler is read-only.") 

973 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

974 

975 # Handle dimension records in dataId 

976 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

977 

978 # Add Registry Dataset entry. 

979 dataId = self._registry.expandDataId(dataId, dimensions=datasetType.dimensions, **kwargs) 

980 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

981 self._datastore.put(obj, ref) 

982 

983 return ref 

984 

985 # TODO: remove on DM-40067. 

986 @deprecated( 

987 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

988 " Please use Butler.get(). Will be removed after v26.0.", 

989 version="v26.0", 

990 category=FutureWarning, 

991 ) 

992 def getDirect( 

993 self, 

994 ref: DatasetRef, 

995 *, 

996 parameters: dict[str, Any] | None = None, 

997 storageClass: StorageClass | str | None = None, 

998 ) -> Any: 

999 """Retrieve a stored dataset. 

1000 

1001 Parameters 

1002 ---------- 

1003 ref : `DatasetRef` 

1004 Resolved reference to an already stored dataset. 

1005 parameters : `dict` 

1006 Additional StorageClass-defined options to control reading, 

1007 typically used to efficiently read only a subset of the dataset. 

1008 storageClass : `StorageClass` or `str`, optional 

1009 The storage class to be used to override the Python type 

1010 returned by this method. By default the returned type matches 

1011 the dataset type definition for this dataset. Specifying a 

1012 read `StorageClass` can force a different type to be returned. 

1013 This type must be compatible with the original type. 

1014 

1015 Returns 

1016 ------- 

1017 obj : `object` 

1018 The dataset. 

1019 """ 

1020 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1021 

1022 # TODO: remove on DM-40067. 

1023 @deprecated( 

1024 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1025 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

1026 version="v26.0", 

1027 category=FutureWarning, 

1028 ) 

1029 def getDirectDeferred( 

1030 self, 

1031 ref: DatasetRef, 

1032 *, 

1033 parameters: dict[str, Any] | None = None, 

1034 storageClass: str | StorageClass | None = None, 

1035 ) -> DeferredDatasetHandle: 

1036 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1037 from a resolved `DatasetRef`. 

1038 

1039 Parameters 

1040 ---------- 

1041 ref : `DatasetRef` 

1042 Resolved reference to an already stored dataset. 

1043 parameters : `dict` 

1044 Additional StorageClass-defined options to control reading, 

1045 typically used to efficiently read only a subset of the dataset. 

1046 storageClass : `StorageClass` or `str`, optional 

1047 The storage class to be used to override the Python type 

1048 returned by this method. By default the returned type matches 

1049 the dataset type definition for this dataset. Specifying a 

1050 read `StorageClass` can force a different type to be returned. 

1051 This type must be compatible with the original type. 

1052 

1053 Returns 

1054 ------- 

1055 obj : `DeferredDatasetHandle` 

1056 A handle which can be used to retrieve a dataset at a later time. 

1057 

1058 Raises 

1059 ------ 

1060 LookupError 

1061 Raised if no matching dataset exists in the `Registry`. 

1062 """ 

1063 # Check that dataset is known to the datastore. 

1064 if not self._datastore.knows(ref): 

1065 raise LookupError(f"Dataset reference {ref} is not known to datastore.") 

1066 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1067 

1068 def getDeferred( 

1069 self, 

1070 datasetRefOrType: DatasetRef | DatasetType | str, 

1071 /, 

1072 dataId: DataId | None = None, 

1073 *, 

1074 parameters: dict | None = None, 

1075 collections: Any = None, 

1076 storageClass: str | StorageClass | None = None, 

1077 **kwargs: Any, 

1078 ) -> DeferredDatasetHandle: 

1079 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1080 after an immediate registry lookup. 

1081 

1082 Parameters 

1083 ---------- 

1084 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1085 When `DatasetRef` the `dataId` should be `None`. 

1086 Otherwise the `DatasetType` or name thereof. 

1087 dataId : `dict` or `DataCoordinate`, optional 

1088 A `dict` of `Dimension` link name, value pairs that label the 

1089 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1090 should be provided as the first argument. 

1091 parameters : `dict` 

1092 Additional StorageClass-defined options to control reading, 

1093 typically used to efficiently read only a subset of the dataset. 

1094 collections : Any, optional 

1095 Collections to be searched, overriding ``self.collections``. 

1096 Can be any of the types supported by the ``collections`` argument 

1097 to butler construction. 

1098 storageClass : `StorageClass` or `str`, optional 

1099 The storage class to be used to override the Python type 

1100 returned by this method. By default the returned type matches 

1101 the dataset type definition for this dataset. Specifying a 

1102 read `StorageClass` can force a different type to be returned. 

1103 This type must be compatible with the original type. 

1104 **kwargs 

1105 Additional keyword arguments used to augment or construct a 

1106 `DataId`. See `DataId` parameters. 

1107 

1108 Returns 

1109 ------- 

1110 obj : `DeferredDatasetHandle` 

1111 A handle which can be used to retrieve a dataset at a later time. 

1112 

1113 Raises 

1114 ------ 

1115 LookupError 

1116 Raised if no matching dataset exists in the `Registry` or 

1117 datastore. 

1118 ValueError 

1119 Raised if a resolved `DatasetRef` was passed as an input, but it 

1120 differs from the one found in the registry. 

1121 TypeError 

1122 Raised if no collections were provided. 
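
Examples
--------
A sketch that defers the expensive read until the handle is used; the
dataset type and data ID values are hypothetical::

    handle = butler.getDeferred("calexp", instrument="MyCam", visit=12345, detector=10)
    # ... later, only when the pixels are actually needed ...
    exposure = handle.get()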

1123 """ 

1124 if isinstance(datasetRefOrType, DatasetRef): 

1125 # Do the quick check first and if that fails, check for artifact 

1126 # existence. This is necessary for datastores that are configured 

1127 # in trust mode where there won't be a record but there will be 

1128 # a file. 

1129 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1130 ref = datasetRefOrType 

1131 else: 

1132 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1133 else: 

1134 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1135 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1136 

1137 def get( 

1138 self, 

1139 datasetRefOrType: DatasetRef | DatasetType | str, 

1140 /, 

1141 dataId: DataId | None = None, 

1142 *, 

1143 parameters: dict[str, Any] | None = None, 

1144 collections: Any = None, 

1145 storageClass: StorageClass | str | None = None, 

1146 **kwargs: Any, 

1147 ) -> Any: 

1148 """Retrieve a stored dataset. 

1149 

1150 Parameters 

1151 ---------- 

1152 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1153 When `DatasetRef` the `dataId` should be `None`. 

1154 Otherwise the `DatasetType` or name thereof. 

1155 If a resolved `DatasetRef`, the associated dataset 

1156 is returned directly without additional querying. 

1157 dataId : `dict` or `DataCoordinate` 

1158 A `dict` of `Dimension` link name, value pairs that label the 

1159 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1160 should be provided as the first argument. 

1161 parameters : `dict` 

1162 Additional StorageClass-defined options to control reading, 

1163 typically used to efficiently read only a subset of the dataset. 

1164 collections : Any, optional 

1165 Collections to be searched, overriding ``self.collections``. 

1166 Can be any of the types supported by the ``collections`` argument 

1167 to butler construction. 

1168 storageClass : `StorageClass` or `str`, optional 

1169 The storage class to be used to override the Python type 

1170 returned by this method. By default the returned type matches 

1171 the dataset type definition for this dataset. Specifying a 

1172 read `StorageClass` can force a different type to be returned. 

1173 This type must be compatible with the original type. 

1174 **kwargs 

1175 Additional keyword arguments used to augment or construct a 

1176 `DataCoordinate`. See `DataCoordinate.standardize` 

1177 parameters. 

1178 

1179 Returns 

1180 ------- 

1181 obj : `object` 

1182 The dataset. 

1183 

1184 Raises 

1185 ------ 

1186 LookupError 

1187 Raised if no matching dataset exists in the `Registry`. 

1188 TypeError 

1189 Raised if no collections were provided. 

1190 

1191 Notes 

1192 ----- 

1193 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1194 this method requires that the given data ID include temporal dimensions 

1195 beyond the dimensions of the dataset type itself, in order to find the 

1196 dataset with the appropriate validity range. For example, a "bias" 

1197 dataset with native dimensions ``{instrument, detector}`` could be 

1198 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1199 ``exposure`` is a temporal dimension. 
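
Examples
--------
A sketch following the calibration example above; the instrument,
detector, exposure, and collection names are hypothetical::

    bias = butler.get(
        "bias",
        instrument="MyCam",
        detector=10,
        exposure=12345,
        collections="MyCam/calib",
    )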

1200 """ 

1201 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1202 ref = self._findDatasetRef( 

1203 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs 

1204 ) 

1205 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1206 

1207 def getURIs( 

1208 self, 

1209 datasetRefOrType: DatasetRef | DatasetType | str, 

1210 /, 

1211 dataId: DataId | None = None, 

1212 *, 

1213 predict: bool = False, 

1214 collections: Any = None, 

1215 run: str | None = None, 

1216 **kwargs: Any, 

1217 ) -> DatasetRefURIs: 

1218 """Return the URIs associated with the dataset. 

1219 

1220 Parameters 

1221 ---------- 

1222 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1223 When `DatasetRef` the `dataId` should be `None`. 

1224 Otherwise the `DatasetType` or name thereof. 

1225 dataId : `dict` or `DataCoordinate` 

1226 A `dict` of `Dimension` link name, value pairs that label the 

1227 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1228 should be provided as the first argument. 

1229 predict : `bool` 

1230 If `True`, allow URIs to be returned of datasets that have not 

1231 been written. 

1232 collections : Any, optional 

1233 Collections to be searched, overriding ``self.collections``. 

1234 Can be any of the types supported by the ``collections`` argument 

1235 to butler construction. 

1236 run : `str`, optional 

1237 Run to use for predictions, overriding ``self.run``. 

1238 **kwargs 

1239 Additional keyword arguments used to augment or construct a 

1240 `DataCoordinate`. See `DataCoordinate.standardize` 

1241 parameters. 

1242 

1243 Returns 

1244 ------- 

1245 uris : `DatasetRefURIs` 

1246 The URI to the primary artifact associated with this dataset (if 

1247 the dataset was disassembled within the datastore this may be 

1248 `None`), and the URIs to any components associated with the dataset 

1249 artifact (which can be empty if there are no components). 

1250 """ 

1251 ref = self._findDatasetRef( 

1252 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1253 ) 

1254 return self._datastore.getURIs(ref, predict) 

1255 

1256 def getURI( 

1257 self, 

1258 datasetRefOrType: DatasetRef | DatasetType | str, 

1259 /, 

1260 dataId: DataId | None = None, 

1261 *, 

1262 predict: bool = False, 

1263 collections: Any = None, 

1264 run: str | None = None, 

1265 **kwargs: Any, 

1266 ) -> ResourcePath: 

1267 """Return the URI to the Dataset. 

1268 

1269 Parameters 

1270 ---------- 

1271 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1272 When `DatasetRef` the `dataId` should be `None`. 

1273 Otherwise the `DatasetType` or name thereof. 

1274 dataId : `dict` or `DataCoordinate` 

1275 A `dict` of `Dimension` link name, value pairs that label the 

1276 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1277 should be provided as the first argument. 

1278 predict : `bool` 

1279 If `True`, allow URIs to be returned of datasets that have not 

1280 been written. 

1281 collections : Any, optional 

1282 Collections to be searched, overriding ``self.collections``. 

1283 Can be any of the types supported by the ``collections`` argument 

1284 to butler construction. 

1285 run : `str`, optional 

1286 Run to use for predictions, overriding ``self.run``. 

1287 **kwargs 

1288 Additional keyword arguments used to augment or construct a 

1289 `DataCoordinate`. See `DataCoordinate.standardize` 

1290 parameters. 

1291 

1292 Returns 

1293 ------- 

1294 uri : `lsst.resources.ResourcePath` 

1295 URI pointing to the Dataset within the datastore. If the 

1296 Dataset does not exist in the datastore, and if ``predict`` is 

1297 `True`, the URI will be a prediction and will include a URI 

1298 fragment "#predicted". 

1299 If the datastore does not have entities that relate well 

1300 to the concept of a URI the returned URI string will be 

1301 descriptive. The returned URI is not guaranteed to be obtainable. 

1302 

1303 Raises 

1304 ------ 

1305 LookupError 

1306 Raised if a URI has been requested for a dataset that does not exist and 

1307 guessing is not allowed. 

1308 ValueError 

1309 Raised if a resolved `DatasetRef` was passed as an input, but it 

1310 differs from the one found in the registry. 

1311 TypeError 

1312 Raised if no collections were provided. 

1313 RuntimeError 

1314 Raised if a URI is requested for a dataset that consists of 

1315 multiple artifacts. 
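
Examples
--------
A sketch; the dataset type and data ID values are hypothetical::

    uri = butler.getURI("calexp", instrument="MyCam", visit=12345, detector=10)
    print(uri.geturl())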

1316 """ 

1317 primary, components = self.getURIs( 

1318 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1319 ) 

1320 

1321 if primary is None or components: 

1322 raise RuntimeError( 

1323 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1324 "Use Butler.getURIs() instead." 

1325 ) 

1326 return primary 

1327 

1328 def get_dataset_type(self, name: str) -> DatasetType: 

1329 return self._registry.getDatasetType(name) 

1330 

1331 def get_dataset( 

1332 self, 

1333 id: DatasetId, 

1334 storage_class: str | StorageClass | None = None, 

1335 dimension_records: bool = False, 

1336 datastore_records: bool = False, 

1337 ) -> DatasetRef | None: 

1338 ref = self._registry.getDataset(id) 

1339 if ref is not None: 

1340 if dimension_records: 

1341 ref = ref.expanded( 

1342 self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions) 

1343 ) 

1344 if storage_class: 

1345 ref = ref.overrideStorageClass(storage_class) 

1346 if datastore_records: 

1347 ref = self._registry.get_datastore_records(ref) 

1348 return ref 

1349 

1350 def find_dataset( 

1351 self, 

1352 dataset_type: DatasetType | str, 

1353 data_id: DataId | None = None, 

1354 *, 

1355 collections: str | Sequence[str] | None = None, 

1356 timespan: Timespan | None = None, 

1357 storage_class: str | StorageClass | None = None, 

1358 dimension_records: bool = False, 

1359 datastore_records: bool = False, 

1360 **kwargs: Any, 

1361 ) -> DatasetRef | None: 

1362 # Handle any parts of the data ID that are not using primary dimension 

1363 # keys. 

1364 if isinstance(dataset_type, str): 

1365 actual_type = self.get_dataset_type(dataset_type) 

1366 else: 

1367 actual_type = dataset_type 

1368 data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs) 

1369 

1370 ref = self._registry.findDataset( 

1371 dataset_type, 

1372 data_id, 

1373 collections=collections, 

1374 timespan=timespan, 

1375 datastore_records=datastore_records, 

1376 **kwargs, 

1377 ) 

1378 if ref is not None and dimension_records: 

1379 ref = ref.expanded(self._registry.expandDataId(ref.dataId, dimensions=ref.datasetType.dimensions)) 

1380 if ref is not None and storage_class is not None: 

1381 ref = ref.overrideStorageClass(storage_class) 

1382 return ref 

1383 

1384 def retrieveArtifacts( 

1385 self, 

1386 refs: Iterable[DatasetRef], 

1387 destination: ResourcePathExpression, 

1388 transfer: str = "auto", 

1389 preserve_path: bool = True, 

1390 overwrite: bool = False, 

1391 ) -> list[ResourcePath]: 

1392 # Docstring inherited. 

1393 return self._datastore.retrieveArtifacts( 

1394 refs, 

1395 ResourcePath(destination), 

1396 transfer=transfer, 

1397 preserve_path=preserve_path, 

1398 overwrite=overwrite, 

1399 ) 

1400 

1401 def exists( 

1402 self, 

1403 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1404 /, 

1405 data_id: DataId | None = None, 

1406 *, 

1407 full_check: bool = True, 

1408 collections: Any = None, 

1409 **kwargs: Any, 

1410 ) -> DatasetExistence: 

1411 # Docstring inherited. 

1412 existence = DatasetExistence.UNRECOGNIZED 

1413 

1414 if isinstance(dataset_ref_or_type, DatasetRef): 

1415 if collections is not None: 

1416 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1417 if data_id is not None: 

1418 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1419 ref = dataset_ref_or_type 

1420 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1421 if registry_ref is not None: 

1422 existence |= DatasetExistence.RECORDED 

1423 

1424 if dataset_ref_or_type != registry_ref: 

1425 # This could mean that storage classes differ, so we should 

1426 # check for that but use the registry ref for the rest of 

1427 # the method. 

1428 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1429 # Use the registry version from now on. 

1430 ref = registry_ref 

1431 else: 

1432 raise ValueError( 

1433 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1434 f"in registry but has different incompatible values ({registry_ref})." 

1435 ) 

1436 else: 

1437 try: 

1438 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1439 except (LookupError, TypeError, NoDefaultCollectionError): 

1440 return existence 

1441 existence |= DatasetExistence.RECORDED 

1442 

1443 if self._datastore.knows(ref): 

1444 existence |= DatasetExistence.DATASTORE 

1445 

1446 if full_check: 

1447 if self._datastore.exists(ref): 

1448 existence |= DatasetExistence._ARTIFACT 

1449 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1450 # Do not add this flag if we know nothing else about the dataset. 

1451 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1452 

1453 return existence 

1454 
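A sketch of exists() and the DatasetExistence flags it combines above (RECORDED, DATASTORE, plus the private _ARTIFACT/_ASSUMED bits). Importing DatasetExistence from the top-level package is assumed; the repository, dataset type, and data ID values are placeholders.

from lsst.daf.butler import Butler, DatasetExistence

butler = Butler("/repo/example", collections=["HSC/runs/example"])  # placeholders
existence = butler.exists(
    "calexp", instrument="HSC", visit=903334, detector=20, full_check=True
)
if existence & DatasetExistence.RECORDED and existence & DatasetExistence.DATASTORE:
    print("Both registry and datastore know this dataset.")
elif existence == DatasetExistence.UNRECOGNIZED:
    print("Nothing at all is known about this dataset.")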

1455 def _exists_many( 

1456 self, 

1457 refs: Iterable[DatasetRef], 

1458 /, 

1459 *, 

1460 full_check: bool = True, 

1461 ) -> dict[DatasetRef, DatasetExistence]: 

1462 # Docstring inherited. 

1463 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1464 

1465 # Registry does not have a bulk API to check for a ref. 

1466 for ref in refs: 

1467 registry_ref = self._registry.getDataset(ref.id) 

1468 if registry_ref is not None: 

1469 # It is possible, albeit unlikely, that the given ref does 

1470 # not match the one in registry even though the UUID matches. 

1471 # When checking a single ref we raise, but it's impolite to 

1472 # do that when potentially hundreds of refs are being checked. 

1473 # We could change the API to only accept UUIDs and that would 

1474 # remove the ability to even check and remove the worry 

1475 # about differing storage classes. Given the ongoing discussion 

1476 # on refs vs UUIDs and whether to raise or have a new 

1477 # private flag, treat this as a private API for now. 

1478 existence[ref] |= DatasetExistence.RECORDED 

1479 

1480 # Ask datastore if it knows about these refs. 

1481 knows = self._datastore.knows_these(refs) 

1482 for ref, known in knows.items(): 

1483 if known: 

1484 existence[ref] |= DatasetExistence.DATASTORE 

1485 

1486 if full_check: 

1487 mexists = self._datastore.mexists(refs) 

1488 for ref, exists in mexists.items(): 

1489 if exists: 

1490 existence[ref] |= DatasetExistence._ARTIFACT 

1491 else: 

1492 # Do not set this flag if nothing is known about the dataset. 

1493 for ref in existence: 

1494 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1495 existence[ref] |= DatasetExistence._ASSUMED 

1496 

1497 return existence 

1498 

1499 # TODO: remove on DM-40079. 

1500 @deprecated( 

1501 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", 

1502 version="v26.0", 

1503 category=FutureWarning, 

1504 ) 

1505 def datasetExists( 

1506 self, 

1507 datasetRefOrType: DatasetRef | DatasetType | str, 

1508 dataId: DataId | None = None, 

1509 *, 

1510 collections: Any = None, 

1511 **kwargs: Any, 

1512 ) -> bool: 

1513 """Return True if the Dataset is actually present in the Datastore. 

1514 

1515 Parameters 

1516 ---------- 

1517 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1518 When `DatasetRef` the `dataId` should be `None`. 

1519 Otherwise the `DatasetType` or name thereof. 

1520 dataId : `dict` or `DataCoordinate` 

1521 A `dict` of `Dimension` link name, value pairs that label the 

1522 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1523 should be provided as the first argument. 

1524 collections : Any, optional 

1525 Collections to be searched, overriding ``self.collections``. 

1526 Can be any of the types supported by the ``collections`` argument 

1527 to butler construction. 

1528 **kwargs 

1529 Additional keyword arguments used to augment or construct a 

1530 `DataCoordinate`. See `DataCoordinate.standardize` 

1531 parameters. 

1532 

1533 Raises 

1534 ------ 

1535 LookupError 

1536 Raised if the dataset is not even present in the Registry. 

1537 ValueError 

1538 Raised if a resolved `DatasetRef` was passed as an input, but it 

1539 differs from the one found in the registry. 

1540 NoDefaultCollectionError 

1541 Raised if no collections were provided. 

1542 """ 

1543 # A resolved ref may be given that is not known to this butler. 

1544 if isinstance(datasetRefOrType, DatasetRef): 

1545 ref = self._registry.getDataset(datasetRefOrType.id) 

1546 if ref is None: 

1547 raise LookupError( 

1548 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1549 ) 

1550 else: 

1551 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1552 return self._datastore.exists(ref) 

1553 

1554 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1555 # Docstring inherited. 

1556 if not self.isWriteable(): 

1557 raise TypeError("Butler is read-only.") 

1558 names = list(names) 

1559 refs: list[DatasetRef] = [] 

1560 for name in names: 

1561 collectionType = self._registry.getCollectionType(name) 

1562 if collectionType is not CollectionType.RUN: 

1563 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1564 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1565 with self._datastore.transaction(), self._registry.transaction(): 

1566 if unstore: 

1567 self._datastore.trash(refs) 

1568 else: 

1569 self._datastore.forget(refs) 

1570 for name in names: 

1571 self._registry.removeCollection(name) 

1572 if unstore: 

1573 # Point of no return for removing artifacts 

1574 self._datastore.emptyTrash() 

1575 
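A sketch of removeRuns, which above requires a writeable butler and refuses collections that are not of RUN type. The repository path, the writeable=True construction argument, and the run names are placeholders/assumptions.

from lsst.daf.butler import Butler

butler = Butler("/repo/example", writeable=True)  # placeholder repository path
# Remove two scratch runs; unstore=True also trashes the stored artifacts.
butler.removeRuns(["u/alice/scratch-1", "u/alice/scratch-2"], unstore=True)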

1576 def pruneDatasets( 

1577 self, 

1578 refs: Iterable[DatasetRef], 

1579 *, 

1580 disassociate: bool = True, 

1581 unstore: bool = False, 

1582 tags: Iterable[str] = (), 

1583 purge: bool = False, 

1584 ) -> None: 

1585 # docstring inherited from LimitedButler 

1586 

1587 if not self.isWriteable(): 

1588 raise TypeError("Butler is read-only.") 

1589 if purge: 

1590 if not disassociate: 

1591 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1592 if not unstore: 

1593 raise TypeError("Cannot pass purge=True without unstore=True.") 

1594 elif disassociate: 

1595 tags = tuple(tags) 

1596 if not tags: 

1597 raise TypeError("No tags provided but disassociate=True.") 

1598 for tag in tags: 

1599 collectionType = self._registry.getCollectionType(tag) 

1600 if collectionType is not CollectionType.TAGGED: 

1601 raise TypeError( 

1602 f"Cannot disassociate from collection '{tag}' " 

1603 f"of non-TAGGED type {collectionType.name}." 

1604 ) 

1605 # Transform possibly-single-pass iterable into something we can iterate 

1606 # over multiple times. 

1607 refs = list(refs) 

1608 # Pruning a component of a DatasetRef makes no sense since registry 

1609 # doesn't know about components and datastore might not store 

1610 # components in a separate file 

1611 for ref in refs: 

1612 if ref.datasetType.component(): 

1613 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1614 # We don't need an unreliable Datastore transaction for this, because 

1615 # we've been extra careful to ensure that Datastore.trash only involves 

1616 # mutating the Registry (it can _look_ at Datastore-specific things, 

1617 # but shouldn't change them), and hence all operations here are 

1618 # Registry operations. 

1619 with self._datastore.transaction(), self._registry.transaction(): 

1620 if unstore: 

1621 self._datastore.trash(refs) 

1622 if purge: 

1623 self._registry.removeDatasets(refs) 

1624 elif disassociate: 

1625 assert tags, "Guaranteed by earlier logic in this function." 

1626 for tag in tags: 

1627 self._registry.disassociate(tag, refs) 

1628 # We've exited the Registry transaction, and apparently committed. 

1629 # (if there was an exception, everything rolled back, and it's as if 

1630 # nothing happened - and we never get here). 

1631 # Datastore artifacts are not yet gone, but they're clearly marked 

1632 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1633 # problems we can try again later, and if manual administrative 

1634 # intervention is required, it's pretty clear what that should entail: 

1635 # deleting everything on disk and in private Datastore tables that is 

1636 # in the dataset_location_trash table. 

1637 if unstore: 

1638 # Point of no return for removing artifacts 

1639 self._datastore.emptyTrash() 

1640 
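A sketch of the two pruning modes whose argument combinations are validated above: a full purge, and a plain disassociation from a TAGGED collection. Repository, dataset type, and collection names are placeholders.

from lsst.daf.butler import Butler

butler = Butler("/repo/example", writeable=True)  # placeholder repository path
refs = list(butler.registry.queryDatasets("calexp", collections="u/alice/scratch-1"))

# Full removal: purge=True requires both disassociate=True and unstore=True.
butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)

# Alternative: only drop the association with a TAGGED collection.
# butler.pruneDatasets(refs, disassociate=True, tags=["u/alice/my-tag"])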

1641 @transactional 

1642 def ingest( 

1643 self, 

1644 *datasets: FileDataset, 

1645 transfer: str | None = "auto", 

1646 run: str | None = None, 

1647 idGenerationMode: DatasetIdGenEnum | None = None, 

1648 record_validation_info: bool = True, 

1649 ) -> None: 

1650 # Docstring inherited. 

1651 if not self.isWriteable(): 

1652 raise TypeError("Butler is read-only.") 

1653 

1654 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1655 if not datasets: 

1656 return 

1657 

1658 if idGenerationMode is not None: 

1659 warnings.warn( 

1660 "The idGenerationMode parameter is no longer used and is ignored. " 

1661 " Will be removed after v26.0", 

1662 FutureWarning, 

1663 stacklevel=2, 

1664 ) 

1665 

1666 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1667 

1668 # We need to reorganize all the inputs so that they are grouped 

1669 # by dataset type and run. Multiple refs in a single FileDataset 

1670 # are required to share the run and dataset type. 

1671 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1672 groupedData: GroupedData = defaultdict(list) 

1673 

1674 # Track DataIDs that are being ingested so we can spot issues early 

1675 # with duplication. Retain previous FileDataset so we can report it. 

1676 groupedDataIds: MutableMapping[ 

1677 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1678 ] = defaultdict(dict) 

1679 

1680 used_run = False 

1681 

1682 # And the nested loop that populates it: 

1683 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1684 # Somewhere to store pre-existing refs if we have an 

1685 # execution butler. 

1686 existingRefs: list[DatasetRef] = [] 

1687 

1688 for ref in dataset.refs: 

1689 assert ref.run is not None # For mypy 

1690 group_key = (ref.datasetType, ref.run) 

1691 

1692 if ref.dataId in groupedDataIds[group_key]: 

1693 raise ConflictingDefinitionError( 

1694 f"Ingest conflict. Dataset {dataset.path} has same" 

1695 " DataId as other ingest dataset" 

1696 f" {groupedDataIds[group_key][ref.dataId].path} " 

1697 f" ({ref.dataId})" 

1698 ) 

1699 

1700 groupedDataIds[group_key][ref.dataId] = dataset 

1701 

1702 if existingRefs: 

1703 if len(dataset.refs) != len(existingRefs): 

1704 # Keeping track of partially pre-existing datasets is hard 

1705 # and should generally never happen. For now don't allow 

1706 # it. 

1707 raise ConflictingDefinitionError( 

1708 f"For dataset {dataset.path} some dataIds already exist" 

1709 " in registry but others do not. This is not supported." 

1710 ) 

1711 

1712 # Store expanded form in the original FileDataset. 

1713 dataset.refs = existingRefs 

1714 else: 

1715 groupedData[group_key].append(dataset) 

1716 

1717 if not used_run and run is not None: 

1718 warnings.warn( 

1719 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

1720 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

1721 category=FutureWarning, 

1722 stacklevel=3, # Take into account the @transactional decorator. 

1723 ) 

1724 

1725 # Now we can bulk-insert into Registry for each DatasetType. 

1726 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1727 groupedData.items(), desc="Bulk-inserting datasets by type" 

1728 ): 

1729 refs_to_import = [] 

1730 for dataset in grouped_datasets: 

1731 refs_to_import.extend(dataset.refs) 

1732 

1733 n_refs = len(refs_to_import) 

1734 _LOG.verbose( 

1735 "Importing %d ref%s of dataset type %r into run %r", 

1736 n_refs, 

1737 "" if n_refs == 1 else "s", 

1738 datasetType.name, 

1739 this_run, 

1740 ) 

1741 

1742 # Import the refs and expand the DataCoordinates since we can't 

1743 # guarantee that they are expanded and Datastore will need 

1744 # the records. 

1745 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1746 assert set(imported_refs) == set(refs_to_import) 

1747 

1748 # Replace all the refs in the FileDataset with expanded versions. 

1749 # Pull them off in the order we put them on the list. 

1750 for dataset in grouped_datasets: 

1751 n_dataset_refs = len(dataset.refs) 

1752 dataset.refs = imported_refs[:n_dataset_refs] 

1753 del imported_refs[:n_dataset_refs] 

1754 

1755 # Bulk-insert everything into Datastore. 

1756 # We do not know if any of the registry entries already existed 

1757 # (_importDatasets only complains if they exist but differ) so 

1758 # we have to catch IntegrityError explicitly. 

1759 try: 

1760 self._datastore.ingest( 

1761 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1762 ) 

1763 except IntegrityError as e: 

1764 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1765 
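A sketch of ingesting an external file with ingest(). The dataset type, dimension values, run name, and file path are placeholders; the dataset type and the dimension records referenced by the data ID are assumed to exist already in the repository.

from lsst.daf.butler import Butler, DatasetRef, FileDataset

butler = Butler("/repo/example", writeable=True)      # placeholder repository
dataset_type = butler.get_dataset_type("raw")         # placeholder dataset type
butler.registry.registerRun("HSC/raw/example")        # placeholder run collection
data_id = butler.registry.expandDataId(
    instrument="HSC", exposure=903334, detector=20    # placeholder data ID
)
ref = DatasetRef(dataset_type, data_id, run="HSC/raw/example")
butler.ingest(
    FileDataset(path="/data/incoming/HSC-903334-20.fits", refs=[ref]),
    transfer="copy",
    record_validation_info=True,
)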

1766 @contextlib.contextmanager 

1767 def export( 

1768 self, 

1769 *, 

1770 directory: str | None = None, 

1771 filename: str | None = None, 

1772 format: str | None = None, 

1773 transfer: str | None = None, 

1774 ) -> Iterator[RepoExportContext]: 

1775 # Docstring inherited. 

1776 if directory is None and transfer is not None: 

1777 raise TypeError("Cannot transfer without providing a directory.") 

1778 if transfer == "move": 

1779 raise TypeError("Transfer may not be 'move': export is read-only") 

1780 if format is None: 

1781 if filename is None: 

1782 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1783 else: 

1784 _, format = os.path.splitext(filename) 

1785 if not format: 

1786 raise ValueError("Please specify a file extension to determine export format.") 

1787 format = format[1:] # Strip leading "." 

1788 elif filename is None: 

1789 filename = f"export.{format}" 

1790 if directory is not None: 

1791 filename = os.path.join(directory, filename) 

1792 formats = self._config["repo_transfer_formats"] 

1793 if format not in formats: 

1794 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1795 BackendClass = get_class_of(formats[format, "export"]) 

1796 with open(filename, "w") as stream: 

1797 backend = BackendClass(stream, universe=self.dimensions) 

1798 try: 

1799 helper = RepoExportContext( 

1800 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1801 ) 

1802 yield helper 

1803 except BaseException: 

1804 raise 

1805 else: 

1806 helper._finish() 

1807 
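A sketch of the export context manager defined above; the backend is chosen from the "yaml" filename extension. The RepoExportContext.saveDatasets call is an assumption about that helper's API, and all names and paths are placeholders.

from lsst.daf.butler import Butler

butler = Butler("/repo/example")  # placeholder repository path
refs = list(butler.registry.queryDatasets("calexp", collections="HSC/runs/example"))
with butler.export(
    directory="/tmp/butler-export",  # placeholder destination for copied artifacts
    filename="export.yaml",          # extension selects the export backend
    transfer="copy",
) as export:
    export.saveDatasets(refs)        # assumed RepoExportContext method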

1808 def import_( 

1809 self, 

1810 *, 

1811 directory: ResourcePathExpression | None = None, 

1812 filename: ResourcePathExpression | TextIO | None = None, 

1813 format: str | None = None, 

1814 transfer: str | None = None, 

1815 skip_dimensions: set | None = None, 

1816 ) -> None: 

1817 # Docstring inherited. 

1818 if not self.isWriteable(): 

1819 raise TypeError("Butler is read-only.") 

1820 if format is None: 

1821 if filename is None: 

1822 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1823 else: 

1824 _, format = os.path.splitext(filename) # type: ignore 

1825 elif filename is None: 

1826 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1827 if directory is not None: 

1828 directory = ResourcePath(directory, forceDirectory=True) 

1829 # mypy doesn't think this will work but it does in python >= 3.10. 

1830 if isinstance(filename, ResourcePathExpression): # type: ignore 

1831 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1832 if not filename.isabs() and directory is not None: 

1833 potential = directory.join(filename) 

1834 exists_in_cwd = filename.exists() 

1835 exists_in_dir = potential.exists() 

1836 if exists_in_cwd and exists_in_dir: 

1837 _LOG.warning( 

1838 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1839 "Additionally, the file exists relative to the given search directory (%s). " 

1840 "Using the export file in the given directory.", 

1841 filename, 

1842 potential, 

1843 ) 

1844 # Given they specified an explicit directory and that 

1845 # directory has the export file in it, assume that that 

1846 # is what was meant despite the file in cwd. 

1847 filename = potential 

1848 elif exists_in_dir: 

1849 filename = potential 

1850 elif not exists_in_cwd and not exists_in_dir: 

1851 # Raise early. 

1852 raise FileNotFoundError( 

1853 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1854 ) 

1855 BackendClass: type[RepoImportBackend] = get_class_of( 

1856 self._config["repo_transfer_formats"][format]["import"] 

1857 ) 

1858 

1859 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1860 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1861 backend.register() 

1862 with self.transaction(): 

1863 backend.load( 

1864 self._datastore, 

1865 directory=directory, 

1866 transfer=transfer, 

1867 skip_dimensions=skip_dimensions, 

1868 ) 

1869 

1870 if isinstance(filename, ResourcePath): 

1871 # We can not use open() here at the moment because of 

1872 # DM-38589 since yaml does stream.read(8192) in a loop. 

1873 stream = io.StringIO(filename.read().decode()) 

1874 doImport(stream) 

1875 else: 

1876 doImport(filename) # type: ignore 

1877 
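A sketch of import_() consuming an export file like the one written by export() above; the target repository and paths are placeholders.

from lsst.daf.butler import Butler

target = Butler("/repo/target", writeable=True)  # placeholder repository path
target.import_(
    directory="/tmp/butler-export",  # directory that holds the exported artifacts
    filename="export.yaml",          # format inferred from the extension, as above
    transfer="copy",
)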

1878 def transfer_from( 

1879 self, 

1880 source_butler: LimitedButler, 

1881 source_refs: Iterable[DatasetRef], 

1882 transfer: str = "auto", 

1883 skip_missing: bool = True, 

1884 register_dataset_types: bool = False, 

1885 transfer_dimensions: bool = False, 

1886 ) -> collections.abc.Collection[DatasetRef]: 

1887 # Docstring inherited. 

1888 if not self.isWriteable(): 

1889 raise TypeError("Butler is read-only.") 

1890 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1891 

1892 # Will iterate through the refs multiple times so need to convert 

1893 # to a list if this isn't a collection. 

1894 if not isinstance(source_refs, collections.abc.Collection): 

1895 source_refs = list(source_refs) 

1896 

1897 original_count = len(source_refs) 

1898 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1899 

1900 # In some situations the datastore artifact may be missing 

1901 # and we do not want that registry entry to be imported. 

1902 # Asking the datastore is not sufficient because the records may have 

1903 # been purged; we have to ask for the (predicted) URI and check 

1904 # existence explicitly. Execution butler is set up exactly like 

1905 # this with no datastore records. 

1906 artifact_existence: dict[ResourcePath, bool] = {} 

1907 if skip_missing: 

1908 dataset_existence = source_butler._datastore.mexists( 

1909 source_refs, artifact_existence=artifact_existence 

1910 ) 

1911 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1912 filtered_count = len(source_refs) 

1913 n_missing = original_count - filtered_count 

1914 _LOG.verbose( 

1915 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1916 n_missing, 

1917 "" if n_missing == 1 else "s", 

1918 filtered_count, 

1919 ) 

1920 

1921 # Importing requires that we group the refs by dataset type and run 

1922 # before doing the import. 

1923 source_dataset_types = set() 

1924 grouped_refs = defaultdict(list) 

1925 for ref in source_refs: 

1926 grouped_refs[ref.datasetType, ref.run].append(ref) 

1927 source_dataset_types.add(ref.datasetType) 

1928 

1929 # Check that each dataset type in the source butler has the same 

1930 # definition in the target butler, and register missing ones if 

1931 # requested. Registration must happen outside a transaction. 

1932 newly_registered_dataset_types = set() 

1933 for datasetType in source_dataset_types: 

1934 if register_dataset_types: 

1935 # Let this raise immediately if inconsistent. Continuing 

1936 # on to find additional inconsistent dataset types 

1937 # might result in additional unwanted dataset types being 

1938 # registered. 

1939 if self._registry.registerDatasetType(datasetType): 

1940 newly_registered_dataset_types.add(datasetType) 

1941 else: 

1942 # If the dataset type is missing, let it fail immediately. 

1943 target_dataset_type = self.get_dataset_type(datasetType.name) 

1944 if target_dataset_type != datasetType: 

1945 raise ConflictingDefinitionError( 

1946 "Source butler dataset type differs from definition" 

1947 f" in target butler: {datasetType} !=" 

1948 f" {target_dataset_type}" 

1949 ) 

1950 if newly_registered_dataset_types: 

1951 # We may have registered some even if there were inconsistencies 

1952 # but should let people know (or else remove them again). 

1953 _LOG.verbose( 

1954 "Registered the following dataset types in the target Butler: %s", 

1955 ", ".join(d.name for d in newly_registered_dataset_types), 

1956 ) 

1957 else: 

1958 _LOG.verbose("All required dataset types are known to the target Butler") 

1959 

1960 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1961 if transfer_dimensions: 

1962 # Collect all the dimension records for these refs. 

1963 # All dimensions are to be copied but the list of valid dimensions 

1964 # come from this butler's universe. 

1965 elements = frozenset( 

1966 element 

1967 for element in self.dimensions.elements 

1968 if element.hasTable() and element.viewOf is None 

1969 ) 

1970 dataIds = {ref.dataId for ref in source_refs} 

1971 # This logic comes from saveDataIds. 

1972 for dataId in dataIds: 

1973 # Need an expanded record; if it is not expanded we need a full 

1974 # butler with a registry (allow mocks with registry too). 

1975 if not dataId.hasRecords(): 

1976 if registry := getattr(source_butler, "registry", None): 

1977 dataId = registry.expandDataId(dataId) 

1978 else: 

1979 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1980 # If this butler doesn't know about a dimension in the source 

1981 # butler, things will break later. 

1982 for element_name in dataId.dimensions.elements: 

1983 record = dataId.records[element_name] 

1984 if record is not None and record.definition in elements: 

1985 dimension_records[record.definition].setdefault(record.dataId, record) 

1986 

1987 handled_collections: set[str] = set() 

1988 

1989 # Do all the importing in a single transaction. 

1990 with self.transaction(): 

1991 if dimension_records: 

1992 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1993 for element, r in dimension_records.items(): 

1994 records = [r[dataId] for dataId in r] 

1995 # Assume that if the record is already present that we can 

1996 # use it without having to check that the record metadata 

1997 # is consistent. 

1998 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1999 

2000 n_imported = 0 

2001 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

2002 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

2003 ): 

2004 if run not in handled_collections: 

2005 # May need to create output collection. If source butler 

2006 # has a registry, ask for documentation string. 

2007 run_doc = None 

2008 if registry := getattr(source_butler, "registry", None): 

2009 run_doc = registry.getCollectionDocumentation(run) 

2010 registered = self._registry.registerRun(run, doc=run_doc) 

2011 handled_collections.add(run) 

2012 if registered: 

2013 _LOG.verbose("Creating output run %s", run) 

2014 

2015 n_refs = len(refs_to_import) 

2016 _LOG.verbose( 

2017 "Importing %d ref%s of dataset type %s into run %s", 

2018 n_refs, 

2019 "" if n_refs == 1 else "s", 

2020 datasetType.name, 

2021 run, 

2022 ) 

2023 

2024 # Assume we are using UUIDs and the source refs will match 

2025 # those imported. 

2026 imported_refs = self._registry._importDatasets(refs_to_import) 

2027 assert set(imported_refs) == set(refs_to_import) 

2028 n_imported += len(imported_refs) 

2029 

2030 assert len(source_refs) == n_imported 

2031 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

2032 

2033 # Ask the datastore to transfer. The datastore has to check that 

2034 # the source datastore is compatible with the target datastore. 

2035 accepted, rejected = self._datastore.transfer_from( 

2036 source_butler._datastore, 

2037 source_refs, 

2038 transfer=transfer, 

2039 artifact_existence=artifact_existence, 

2040 ) 

2041 if rejected: 

2042 # For now, accept the registry entries but not the files. 

2043 _LOG.warning( 

2044 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2045 len(rejected), 

2046 len(accepted), 

2047 datasetType, 

2048 run, 

2049 ) 

2050 

2051 return source_refs 

2052 
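A sketch of transfer_from between two repositories, using only the parameters defined above; repository paths, dataset type, and collection are placeholders.

from lsst.daf.butler import Butler

source = Butler("/repo/source", collections=["HSC/runs/example"])  # placeholders
target = Butler("/repo/target", writeable=True)
refs = list(source.registry.queryDatasets("calexp", findFirst=True))
transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    skip_missing=True,            # drop refs whose artifacts no longer exist
    register_dataset_types=True,  # register missing dataset types in the target
    transfer_dimensions=True,     # copy the needed dimension records as well
)
print(f"Transferred {len(transferred)} dataset(s)")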

2053 def validateConfiguration( 

2054 self, 

2055 logFailures: bool = False, 

2056 datasetTypeNames: Iterable[str] | None = None, 

2057 ignore: Iterable[str] | None = None, 

2058 ) -> None: 

2059 # Docstring inherited. 

2060 if datasetTypeNames: 

2061 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

2062 else: 

2063 datasetTypes = list(self._registry.queryDatasetTypes()) 

2064 

2065 # filter out anything from the ignore list 

2066 if ignore: 

2067 ignore = set(ignore) 

2068 datasetTypes = [ 

2069 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2070 ] 

2071 else: 

2072 ignore = set() 

2073 

2074 # For each datasetType that has an instrument dimension, create 

2075 # a DatasetRef for each defined instrument 

2076 datasetRefs = [] 

2077 

2078 # Find all the registered instruments (if "instrument" is in the 

2079 # universe). 

2080 if "instrument" in self.dimensions: 

2081 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2082 

2083 for datasetType in datasetTypes: 

2084 if "instrument" in datasetType.dimensions: 

2085 # In order to create a conforming dataset ref, create 

2086 # fake DataCoordinate values for the non-instrument 

2087 # dimensions. The type of the value does not matter here. 

2088 dataId = {dim: 1 for dim in datasetType.dimensions.names if dim != "instrument"} 

2089 

2090 for instrument in instruments: 

2091 datasetRef = DatasetRef( 

2092 datasetType, 

2093 DataCoordinate.standardize( 

2094 dataId, instrument=instrument, dimensions=datasetType.dimensions 

2095 ), 

2096 run="validate", 

2097 ) 

2098 datasetRefs.append(datasetRef) 

2099 

2100 entities: list[DatasetType | DatasetRef] = [] 

2101 entities.extend(datasetTypes) 

2102 entities.extend(datasetRefs) 

2103 

2104 datastoreErrorStr = None 

2105 try: 

2106 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2107 except ValidationError as e: 

2108 datastoreErrorStr = str(e) 

2109 

2110 # Also check that the LookupKeys used by the datastores match 

2111 # registry and storage class definitions 

2112 keys = self._datastore.getLookupKeys() 

2113 

2114 failedNames = set() 

2115 failedDataId = set() 

2116 for key in keys: 

2117 if key.name is not None: 

2118 if key.name in ignore: 

2119 continue 

2120 

2121 # skip if specific datasetType names were requested and this 

2122 # name does not match 

2123 if datasetTypeNames and key.name not in datasetTypeNames: 

2124 continue 

2125 

2126 # See if it is a StorageClass or a DatasetType 

2127 if key.name in self.storageClasses: 

2128 pass 

2129 else: 

2130 try: 

2131 self.get_dataset_type(key.name) 

2132 except KeyError: 

2133 if logFailures: 

2134 _LOG.critical( 

2135 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2136 ) 

2137 failedNames.add(key) 

2138 else: 

2139 # Dimensions are checked for consistency when the Butler 

2140 # is created and rendezvoused with a universe. 

2141 pass 

2142 

2143 # Check that the instrument is a valid instrument 

2144 # Currently only support instrument so check for that 

2145 if key.dataId: 

2146 dataIdKeys = set(key.dataId) 

2147 if {"instrument"} != dataIdKeys: 

2148 if logFailures: 

2149 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2150 failedDataId.add(key) 

2151 elif key.dataId["instrument"] not in instruments: 

2152 if logFailures: 

2153 _LOG.critical("Key '%s' has unknown instrument", key) 

2154 failedDataId.add(key) 

2155 

2156 messages = [] 

2157 

2158 if datastoreErrorStr: 

2159 messages.append(datastoreErrorStr) 

2160 

2161 for failed, msg in ( 

2162 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2163 (failedDataId, "Keys with bad DataId entries: "), 

2164 ): 

2165 if failed: 

2166 msg += ", ".join(str(k) for k in failed) 

2167 messages.append(msg) 

2168 

2169 if messages: 

2170 raise ValidationError(";\n".join(messages)) 

2171 
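A sketch of validateConfiguration and the combined ValidationError it raises; importing ValidationError from the top-level package is assumed, and the dataset type names are placeholders.

from lsst.daf.butler import Butler, ValidationError

butler = Butler("/repo/example")  # placeholder repository path
try:
    butler.validateConfiguration(
        logFailures=True,
        datasetTypeNames=["calexp", "raw"],  # placeholder subset; omit to check all
        ignore=["packages"],                 # placeholder names to skip
    )
except ValidationError as err:
    print(f"Configuration problems:\n{err}")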

2172 @property 

2173 def collections(self) -> Sequence[str]: 

2174 """The collections to search by default, in order 

2175 (`~collections.abc.Sequence` [ `str` ]). 

2176 

2177 This is an alias for ``self.registry.defaults.collections``. It cannot 

2178 be set directly in isolation, but all defaults may be changed together 

2179 by assigning a new `RegistryDefaults` instance to 

2180 ``self.registry.defaults``. 

2181 """ 

2182 return self._registry.defaults.collections 

2183 

2184 @property 

2185 def run(self) -> str | None: 

2186 """Name of the run this butler writes outputs to by default (`str` or 

2187 `None`). 

2188 

2189 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2190 directly in isolation, but all defaults may be changed together by 

2191 assigning a new `RegistryDefaults` instance to 

2192 ``self.registry.defaults``. 

2193 """ 

2194 return self._registry.defaults.run 

2195 

2196 @property 

2197 def registry(self) -> Registry: 

2198 """The object that manages dataset metadata and relationships 

2199 (`Registry`). 

2200 

2201 Many operations that don't involve reading or writing butler datasets 

2202 are accessible only via `Registry` methods. Eventually these methods 

2203 will be replaced by equivalent `Butler` methods. 

2204 """ 

2205 return self._registry_shim 

2206 

2207 @property 

2208 def dimensions(self) -> DimensionUniverse: 

2209 # Docstring inherited. 

2210 return self._registry.dimensions 

2211 

2212 _registry: SqlRegistry 

2213 """The object that manages dataset metadata and relationships 

2214 (`SqlRegistry`). 

2215 

2216 Most operations that don't involve reading or writing butler datasets are 

2217 accessible only via `SqlRegistry` methods. 

2218 """ 

2219 

2220 datastore: Datastore 

2221 """The object that manages actual dataset storage (`Datastore`). 

2222 

2223 Direct user access to the datastore should rarely be necessary; the primary 

2224 exception is the case where a `Datastore` implementation provides extra 

2225 functionality beyond what the base class defines. 

2226 """ 

2227 

2228 storageClasses: StorageClassFactory 

2229 """An object that maps known storage class names to objects that fully 

2230 describe them (`StorageClassFactory`). 

2231 """