Coverage for python/lsst/daf/butler/direct_butler.py: 11%

712 statements  

coverage.py v7.3.2, created at 2023-11-04 09:46 +0000

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This software is dual licensed under the GNU General Public License and also 

10# under a 3-clause BSD license. Recipients may choose which of these licenses 

11# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, 

12# respectively. If you choose the GPL option then the following text applies 

13# (but note that there is still no warranty even if you opt for BSD instead): 

14# 

15# This program is free software: you can redistribute it and/or modify 

16# it under the terms of the GNU General Public License as published by 

17# the Free Software Foundation, either version 3 of the License, or 

18# (at your option) any later version. 

19# 

20# This program is distributed in the hope that it will be useful, 

21# but WITHOUT ANY WARRANTY; without even the implied warranty of 

22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

23# GNU General Public License for more details. 

24# 

25# You should have received a copy of the GNU General Public License 

26# along with this program. If not, see <http://www.gnu.org/licenses/>. 

27 

28"""Butler top level classes. 

29""" 

30from __future__ import annotations 

31 

32__all__ = ( 

33 "DirectButler", 

34 "ButlerValidationError", 

35) 

36 

37import collections.abc 

38import contextlib 

39import io 

40import logging 

41import numbers 

42import os 

43import warnings 

44from collections import Counter, defaultdict 

45from collections.abc import Iterable, Iterator, MutableMapping, Sequence 

46from typing import TYPE_CHECKING, Any, ClassVar, TextIO 

47 

48from deprecated.sphinx import deprecated 

49from lsst.resources import ResourcePath, ResourcePathExpression 

50from lsst.utils.introspection import get_class_of 

51from lsst.utils.logging import VERBOSE, getLogger 

52from sqlalchemy.exc import IntegrityError 

53 

54from ._butler import Butler 

55from ._butler_config import ButlerConfig 

56from ._config import Config 

57from ._dataset_existence import DatasetExistence 

58from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef 

59from ._dataset_type import DatasetType 

60from ._deferredDatasetHandle import DeferredDatasetHandle 

61from ._exceptions import ValidationError 

62from ._file_dataset import FileDataset 

63from ._limited_butler import LimitedButler 

64from ._registry_shim import RegistryShim 

65from ._storage_class import StorageClass, StorageClassFactory 

66from ._timespan import Timespan 

67from .datastore import DatasetRefURIs, Datastore, NullDatastore 

68from .dimensions import ( 

69 DataCoordinate, 

70 DataId, 

71 DataIdValue, 

72 Dimension, 

73 DimensionElement, 

74 DimensionRecord, 

75 DimensionUniverse, 

76) 

77from .progress import Progress 

78from .registry import ( 

79 CollectionType, 

80 ConflictingDefinitionError, 

81 DataIdError, 

82 MissingDatasetTypeError, 

83 NoDefaultCollectionError, 

84 Registry, 

85 RegistryDefaults, 

86 _RegistryFactory, 

87) 

88from .registry.sql_registry import SqlRegistry 

89from .transfers import RepoExportContext 

90from .utils import transactional 

91 

92if TYPE_CHECKING: 

93 from lsst.resources import ResourceHandleProtocol 

94 

95 from .transfers import RepoImportBackend 

96 

97_LOG = getLogger(__name__) 

98 

99 

100class ButlerValidationError(ValidationError): 

101 """There is a problem with the Butler configuration.""" 

102 

103 pass 

104 

105 

106class DirectButler(Butler): 

107 """Main entry point for the data access system. 

108 

109 Parameters 

110 ---------- 

111 config : `ButlerConfig`, `Config`, or `str`, optional 

112 Configuration. Anything acceptable to the 

113 `ButlerConfig` constructor. If a directory path 

114 is given the configuration will be read from a ``butler.yaml`` file in 

115 that location. If `None` is given default values will be used. 

116 butler : `DirectButler`, optional 

117 If provided, construct a new Butler that uses the same registry and 

118 datastore as the given one, but with the given collection and run. 

119 Incompatible with the ``config``, ``searchPaths``, and ``writeable`` 

120 arguments. 

121 collections : `str` or `~collections.abc.Iterable` [ `str` ], optional 

122 An expression specifying the collections to be searched (in order) when 

123 reading datasets. 

124 This may be a `str` collection name or an iterable thereof. 

125 See :ref:`daf_butler_collection_expressions` for more information. 

126 These collections are not registered automatically and must be 

127 manually registered before they are used by any method, but they may be 

128 manually registered after the `Butler` is initialized. 

129 run : `str`, optional 

130 Name of the `~CollectionType.RUN` collection new datasets should be 

131 inserted into. If ``collections`` is `None` and ``run`` is not `None`, 

132 ``collections`` will be set to ``[run]``. If not `None`, this 

133 collection will automatically be registered. If this is not set (and 

134 ``writeable`` is not set either), a read-only butler will be created. 

135 searchPaths : `list` of `str`, optional 

136 Directory paths to search when calculating the full Butler 

137 configuration. Not used if the supplied config is already a 

138 `ButlerConfig`. 

139 writeable : `bool`, optional 

140 Explicitly sets whether the butler supports write operations. If not 

141 provided, a read-write butler is created if any of ``run``, ``tags``, 

142 or ``chains`` is non-empty. 

143 inferDefaults : `bool`, optional 

144 If `True` (default) infer default data ID values from the values 

145 present in the datasets in ``collections``: if all collections have the 

146 same value (or no value) for a governor dimension, that value will be 

147 the default for that dimension. Nonexistent collections are ignored. 

148 If a default value is provided explicitly for a governor dimension via 

149 ``**kwargs``, no default will be inferred for that dimension. 

150 without_datastore : `bool`, optional 

151 If `True` do not attach a datastore to this butler. Any attempts 

152 to use a datastore will fail. 

153 **kwargs : `str` 

154 Default data ID key-value pairs. These may only identify "governor" 

155 dimensions like ``instrument`` and ``skymap``. 

156 """ 

157 

158 def __init__( 

159 self, 

160 config: Config | ResourcePathExpression | None = None, 

161 *, 

162 butler: DirectButler | None = None, 

163 collections: Any = None, 

164 run: str | None = None, 

165 searchPaths: Sequence[ResourcePathExpression] | None = None, 

166 writeable: bool | None = None, 

167 inferDefaults: bool = True, 

168 without_datastore: bool = False, 

169 **kwargs: str, 

170 ): 

171 defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) 

172 # Load registry, datastore, etc. from config or existing butler. 

173 if butler is not None: 

174 if config is not None or searchPaths is not None or writeable is not None: 

175 raise TypeError( 

176 "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 

177 ) 

178 self._registry = butler._registry.copy(defaults) 

179 self._datastore = butler._datastore 

180 self.storageClasses = butler.storageClasses 

181 self._config: ButlerConfig = butler._config 

182 else: 

183 self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) 

184 try: 

185 butlerRoot = self._config.get("root", self._config.configDir) 

186 if writeable is None: 

187 writeable = run is not None 

188 self._registry = _RegistryFactory(self._config).from_config( 

189 butlerRoot=butlerRoot, writeable=writeable, defaults=defaults 

190 ) 

191 if without_datastore: 

192 self._datastore = NullDatastore(None, None) 

193 else: 

194 self._datastore = Datastore.fromConfig( 

195 self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot 

196 ) 

197 # TODO: Once datastore drops dependency on registry we can 

198 # construct datastore first and pass opaque tables to registry 

199 # constructor. 

200 self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions()) 

201 self.storageClasses = StorageClassFactory() 

202 self.storageClasses.addFromConfig(self._config) 

203 except Exception: 

204 # Failures here usually mean that configuration is incomplete, 

205 # just issue an error message which includes config file URI. 

206 _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.") 

207 raise 

208 

209 # For an execution butler the datastore needs a special 

210 # dependency-inversion trick. This is not used by regular butler, 

211 # but we do not have a way to distinguish regular butler from execution 

212 # butler. 

213 self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) 

214 

215 if "run" in self._config or "collection" in self._config: 

216 raise ValueError("Passing a run or collection via configuration is no longer supported.") 

217 

218 self._registry_shim = RegistryShim(self) 

219 
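# Editor-added usage sketch (not part of this module or its coverage numbering).
# The repository path, collection names, and governor value are hypothetical;
# the keyword arguments follow the constructor parameters documented above.
from lsst.daf.butler.direct_butler import DirectButler

butler = DirectButler(
    "/repo/main",                     # directory containing a butler.yaml configuration
    collections=["LATISS/defaults"],  # default collections searched when reading
    run="u/someone/scratch",          # default RUN collection; implies a writeable butler
    instrument="LATISS",              # governor-dimension default supplied via **kwargs
)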

220 GENERATION: ClassVar[int] = 3 

221 """This is a Generation 3 Butler. 

222 

223 This attribute may be removed in the future, once the Generation 2 Butler 

224 interface has been fully retired; it should only be used in transitional 

225 code. 

226 """ 

227 

228 def _retrieve_dataset_type(self, name: str) -> DatasetType | None: 

229 """Return DatasetType defined in registry given dataset type name.""" 

230 try: 

231 return self.get_dataset_type(name) 

232 except MissingDatasetTypeError: 

233 return None 

234 

235 @classmethod 

236 def _unpickle( 

237 cls, 

238 config: ButlerConfig, 

239 collections: tuple[str, ...] | None, 

240 run: str | None, 

241 defaultDataId: dict[str, str], 

242 writeable: bool, 

243 ) -> DirectButler: 

244 """Callable used to unpickle a Butler. 

245 

246 We prefer not to use ``Butler.__init__`` directly so we can force some 

247 of its many arguments to be keyword-only (note that ``__reduce__`` 

248 can only invoke callables with positional arguments). 

249 

250 Parameters 

251 ---------- 

252 config : `ButlerConfig` 

253 Butler configuration, already coerced into a true `ButlerConfig` 

254 instance (and hence after any search paths for overrides have been 

255 utilized). 

256 collections : `tuple` [ `str` ] 

257 Names of the default collections to read from. 

258 run : `str`, optional 

259 Name of the default `~CollectionType.RUN` collection to write to. 

260 defaultDataId : `dict` [ `str`, `str` ] 

261 Default data ID values. 

262 writeable : `bool` 

263 Whether the Butler should support write operations. 

264 

265 Returns 

266 ------- 

267 butler : `Butler` 

268 A new `Butler` instance. 

269 """ 

270 # MyPy doesn't recognize that the kwargs below are totally valid; it 

271 # seems to think ``**defaultDataId`` is a _positional_ argument! 

272 return cls( 

273 config=config, 

274 collections=collections, 

275 run=run, 

276 writeable=writeable, 

277 **defaultDataId, # type: ignore 

278 ) 

279 

280 def __reduce__(self) -> tuple: 

281 """Support pickling.""" 

282 return ( 

283 DirectButler._unpickle, 

284 ( 

285 self._config, 

286 self.collections, 

287 self.run, 

288 self._registry.defaults.dataId.byName(), 

289 self._registry.isWriteable(), 

290 ), 

291 ) 

292 

293 def __str__(self) -> str: 

294 return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( 

295 self.collections, self.run, self._datastore, self._registry 

296 ) 

297 

298 def isWriteable(self) -> bool: 

299 # Docstring inherited. 

300 return self._registry.isWriteable() 

301 

302 @contextlib.contextmanager 

303 def transaction(self) -> Iterator[None]: 

304 """Context manager supporting `Butler` transactions. 

305 

306 Transactions can be nested. 

307 """ 

308 with self._registry.transaction(), self._datastore.transaction(): 

309 yield 

310 
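# Editor-added sketch of Butler.transaction(): registry and datastore changes
# inside the context commit or roll back together. ``butler``, ``obj_a``,
# ``obj_b``, the "calexp" dataset type, and the data IDs are hypothetical.
with butler.transaction():
    butler.put(obj_a, "calexp", data_id_a)  # both puts are committed together...
    butler.put(obj_b, "calexp", data_id_b)  # ...or rolled back if an exception is raised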

311 def _standardizeArgs( 

312 self, 

313 datasetRefOrType: DatasetRef | DatasetType | str, 

314 dataId: DataId | None = None, 

315 for_put: bool = True, 

316 **kwargs: Any, 

317 ) -> tuple[DatasetType, DataId | None]: 

318 """Standardize the arguments passed to several Butler APIs. 

319 

320 Parameters 

321 ---------- 

322 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

323 When `DatasetRef` the `dataId` should be `None`. 

324 Otherwise the `DatasetType` or name thereof. 

325 dataId : `dict` or `DataCoordinate` 

326 A `dict` of `Dimension` link name, value pairs that label the 

327 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

328 should be provided as the second argument. 

329 for_put : `bool`, optional 

330 If `True` this call is invoked as part of a `Butler.put()`. 

331 Otherwise it is assumed to be part of a `Butler.get()`. This 

332 parameter is only relevant if there is dataset type 

333 inconsistency. 

334 **kwargs 

335 Additional keyword arguments used to augment or construct a 

336 `DataCoordinate`. See `DataCoordinate.standardize` 

337 parameters. 

338 

339 Returns 

340 ------- 

341 datasetType : `DatasetType` 

342 A `DatasetType` instance extracted from ``datasetRefOrType``. 

343 dataId : `dict` or `DataId`, optional 

344 Argument that can be used (along with ``kwargs``) to construct a 

345 `DataId`. 

346 

347 Notes 

348 ----- 

349 Butler APIs that conceptually need a DatasetRef also allow passing a 

350 `DatasetType` (or the name of one) and a `DataId` (or a dict and 

351 keyword arguments that can be used to construct one) separately. This 

352 method accepts those arguments and always returns a true `DatasetType` 

353 and a `DataId` or `dict`. 

354 

355 Standardization of `dict` vs `DataId` is best handled by passing the 

356 returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are 

357 generally similarly flexible. 

358 """ 

359 externalDatasetType: DatasetType | None = None 

360 internalDatasetType: DatasetType | None = None 

361 if isinstance(datasetRefOrType, DatasetRef): 

362 if dataId is not None or kwargs: 

363 raise ValueError("DatasetRef given, cannot use dataId as well") 

364 externalDatasetType = datasetRefOrType.datasetType 

365 dataId = datasetRefOrType.dataId 

366 else: 

367 # Don't check whether DataId is provided, because Registry APIs 

368 # can usually construct a better error message when it wasn't. 

369 if isinstance(datasetRefOrType, DatasetType): 

370 externalDatasetType = datasetRefOrType 

371 else: 

372 internalDatasetType = self.get_dataset_type(datasetRefOrType) 

373 

374 # Check that they are self-consistent 

375 if externalDatasetType is not None: 

376 internalDatasetType = self.get_dataset_type(externalDatasetType.name) 

377 if externalDatasetType != internalDatasetType: 

378 # We can allow differences if they are compatible, depending 

379 # on whether this is a get or a put. A get requires that 

380 # the python type associated with the datastore can be 

381 # converted to the user type. A put requires that the user 

382 # supplied python type can be converted to the internal 

383 # type expected by registry. 

384 relevantDatasetType = internalDatasetType 

385 if for_put: 

386 is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) 

387 else: 

388 is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) 

389 relevantDatasetType = externalDatasetType 

390 if not is_compatible: 

391 raise ValueError( 

392 f"Supplied dataset type ({externalDatasetType}) inconsistent with " 

393 f"registry definition ({internalDatasetType})" 

394 ) 

395 # Override the internal definition. 

396 internalDatasetType = relevantDatasetType 

397 

398 assert internalDatasetType is not None 

399 return internalDatasetType, dataId 

400 

401 def _rewrite_data_id( 

402 self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any 

403 ) -> tuple[DataId | None, dict[str, Any]]: 

404 """Rewrite a data ID taking into account dimension records. 

405 

406 Take a Data ID and keyword args and rewrite it if necessary to 

407 allow the user to specify dimension records rather than dimension 

408 primary values. 

409 

410 This allows a user to include a dataId dict with keys of 

411 ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving 

412 the integer exposure ID. It also allows a string to be given 

413 for a dimension value rather than the integer ID if that is more 

414 convenient. For example, rather than having to specify the 

415 detector with ``detector.full_name``, a string given for ``detector`` 

416 will be interpreted as the full name and converted to the integer 

417 value. 

418 

419 Keyword arguments can also use strings for dimensions like detector 

420 and exposure but python does not allow them to include ``.`` and 

421 so the ``exposure.day_obs`` syntax can not be used in a keyword 

422 argument. 

423 

424 Parameters 

425 ---------- 

426 dataId : `dict` or `DataCoordinate` 

427 A `dict` of `Dimension` link name, value pairs that will label the 

428 `DatasetRef` within a Collection. 

429 datasetType : `DatasetType` 

430 The dataset type associated with this dataId. Required to 

431 determine the relevant dimensions. 

432 **kwargs 

433 Additional keyword arguments used to augment or construct a 

434 `DataId`. See `DataId` parameters. 

435 

436 Returns 

437 ------- 

438 dataId : `dict` or `DataCoordinate` 

439 The possibly rewritten dataId. If given a `DataCoordinate` and 

440 no keyword arguments, the original dataId will be returned 

441 unchanged. 

442 **kwargs : `dict` 

443 Any unused keyword arguments (normally an empty dict). 

444 """ 

445 # Do nothing if we have a standalone DataCoordinate. 

446 if isinstance(dataId, DataCoordinate) and not kwargs: 

447 return dataId, kwargs 

448 

449 # Process dimension records that are using record information 

450 # rather than ids 

451 newDataId: dict[str, DataIdValue] = {} 

452 byRecord: dict[str, dict[str, Any]] = defaultdict(dict) 

453 

454 # if all the dataId comes from keyword parameters we do not need 

455 # to do anything here because they can't be of the form 

456 # exposure.obs_id because a "." is not allowed in a keyword parameter. 

457 if dataId: 

458 for k, v in dataId.items(): 

459 # If we have a Dimension we do not need to do anything 

460 # because it cannot be a compound key. 

461 if isinstance(k, str) and "." in k: 

462 # Someone is using a more human-readable dataId 

463 dimensionName, record = k.split(".", 1) 

464 byRecord[dimensionName][record] = v 

465 elif isinstance(k, Dimension): 

466 newDataId[k.name] = v 

467 else: 

468 newDataId[k] = v 

469 

470 # Go through the updated dataId and check the type in case someone is 

471 # using an alternate key. We have already filtered out compound 

472 # keys in the dimension.record format. 

473 not_dimensions = {} 

474 

475 # Will need to look in the dataId and the keyword arguments 

476 # and will remove them if they need to be fixed or are unrecognized. 

477 for dataIdDict in (newDataId, kwargs): 

478 # Use a list so we can adjust the dict safely in the loop 

479 for dimensionName in list(dataIdDict): 

480 value = dataIdDict[dimensionName] 

481 try: 

482 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

483 except KeyError: 

484 # This is not a real dimension 

485 not_dimensions[dimensionName] = value 

486 del dataIdDict[dimensionName] 

487 continue 

488 

489 # Convert an integral type to an explicit int to simplify 

490 # comparisons here 

491 if isinstance(value, numbers.Integral): 

492 value = int(value) 

493 

494 if not isinstance(value, dimension.primaryKey.getPythonType()): 

495 for alternate in dimension.alternateKeys: 

496 if isinstance(value, alternate.getPythonType()): 

497 byRecord[dimensionName][alternate.name] = value 

498 del dataIdDict[dimensionName] 

499 _LOG.debug( 

500 "Converting dimension %s to %s.%s=%s", 

501 dimensionName, 

502 dimensionName, 

503 alternate.name, 

504 value, 

505 ) 

506 break 

507 else: 

508 _LOG.warning( 

509 "Type mismatch found for value '%r' provided for dimension %s. " 

510 "Could not find matching alternative (primary key has type %s) " 

511 "so attempting to use as-is.", 

512 value, 

513 dimensionName, 

514 dimension.primaryKey.getPythonType(), 

515 ) 

516 

517 # By this point kwargs and newDataId should only include valid 

518 # dimensions. Merge kwargs in to the new dataId and log if there 

519 # are dimensions in both (rather than calling update). 

520 for k, v in kwargs.items(): 

521 if k in newDataId and newDataId[k] != v: 

522 _LOG.debug( 

523 "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v 

524 ) 

525 newDataId[k] = v 

526 # No need to retain any values in kwargs now. 

527 kwargs = {} 

528 

529 # If we have some unrecognized dimensions we have to try to connect 

530 # them to records in other dimensions. This is made more complicated 

531 # by some dimensions having records with clashing names. A mitigation 

532 # is that we can tell by this point which dimensions are missing 

533 # for the DatasetType but this does not work for calibrations 

534 # where additional dimensions can be used to constrain the temporal 

535 # axis. 

536 if not_dimensions: 

537 # Search for all dimensions even if we have been given a value 

538 # explicitly. In some cases records are given as well as the 

539 # actual dimension and this should not be an error if they 

540 # match. 

541 mandatoryDimensions = datasetType.dimensions.names # - provided 

542 

543 candidateDimensions: set[str] = set() 

544 candidateDimensions.update(mandatoryDimensions) 

545 

546 # For calibrations we may well be needing temporal dimensions 

547 # so rather than always including all dimensions in the scan 

548 # restrict things a little. It is still possible for there 

549 # to be confusion over day_obs in visit vs exposure for example. 

550 # If we are not searching calibration collections things may 

551 # fail but they are going to fail anyway because of the 

552 # ambiguity of the dataId... 

553 if datasetType.isCalibration(): 

554 for dim in self.dimensions.getStaticDimensions(): 

555 if dim.temporal: 

556 candidateDimensions.add(str(dim)) 

557 

558 # Look up table for the first association with a dimension 

559 guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) 

560 

561 # Keep track of whether an item is associated with multiple 

562 # dimensions. 

563 counter: Counter[str] = Counter() 

564 assigned: dict[str, set[str]] = defaultdict(set) 

565 

566 # Go through the missing dimensions and associate the 

567 # given names with records within those dimensions 

568 matched_dims = set() 

569 for dimensionName in candidateDimensions: 

570 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

571 fields = dimension.metadata.names | dimension.uniqueKeys.names 

572 for field in not_dimensions: 

573 if field in fields: 

574 guessedAssociation[dimensionName][field] = not_dimensions[field] 

575 counter[dimensionName] += 1 

576 assigned[field].add(dimensionName) 

577 matched_dims.add(field) 

578 

579 # Calculate the fields that matched nothing. 

580 never_found = set(not_dimensions) - matched_dims 

581 

582 if never_found: 

583 raise ValueError(f"Unrecognized keyword args given: {never_found}") 

584 

585 # There is a chance we have allocated a single dataId item 

586 # to multiple dimensions. Need to decide which should be retained. 

587 # For now assume that the most popular alternative wins. 

588 # This means that day_obs with seq_num will result in 

589 # exposure.day_obs and not visit.day_obs 

590 # Also prefer an explicitly missing dimension over an inferred 

591 # temporal dimension. 

592 for fieldName, assignedDimensions in assigned.items(): 

593 if len(assignedDimensions) > 1: 

594 # Pick the most popular (preferring mandatory dimensions) 

595 requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) 

596 if requiredButMissing: 

597 candidateDimensions = requiredButMissing 

598 else: 

599 candidateDimensions = assignedDimensions 

600 

601 # If this is a choice between visit and exposure and 

602 # neither was a required part of the dataset type, 

603 # (hence in this branch) always prefer exposure over 

604 # visit since exposures are always defined and visits 

605 # are defined from exposures. 

606 if candidateDimensions == {"exposure", "visit"}: 

607 candidateDimensions = {"exposure"} 

608 

609 # Select the relevant items and get a new restricted 

610 # counter. 

611 theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} 

612 duplicatesCounter: Counter[str] = Counter() 

613 duplicatesCounter.update(theseCounts) 

614 

615 # Choose the most common. If they are equally common 

616 # we will pick the one that was found first. 

617 # Returns a list of tuples 

618 selected = duplicatesCounter.most_common(1)[0][0] 

619 

620 _LOG.debug( 

621 "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 

622 " Removed ambiguity by choosing dimension %s.", 

623 fieldName, 

624 ", ".join(assignedDimensions), 

625 selected, 

626 ) 

627 

628 for candidateDimension in assignedDimensions: 

629 if candidateDimension != selected: 

630 del guessedAssociation[candidateDimension][fieldName] 

631 

632 # Update the record look up dict with the new associations 

633 for dimensionName, values in guessedAssociation.items(): 

634 if values: # A dict might now be empty 

635 _LOG.debug( 

636 "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values 

637 ) 

638 byRecord[dimensionName].update(values) 

639 

640 if byRecord: 

641 # Some record specifiers were found so we need to convert 

642 # them to the Id form 

643 for dimensionName, values in byRecord.items(): 

644 if dimensionName in newDataId: 

645 _LOG.debug( 

646 "DataId specified explicit %s dimension value of %s in addition to" 

647 " general record specifiers for it of %s. Ignoring record information.", 

648 dimensionName, 

649 newDataId[dimensionName], 

650 str(values), 

651 ) 

652 # Get the actual record and compare with these values. 

653 try: 

654 recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) 

655 except DataIdError: 

656 raise ValueError( 

657 f"Could not find dimension '{dimensionName}'" 

658 f" with dataId {newDataId} as part of comparing with" 

659 f" record values {byRecord[dimensionName]}" 

660 ) from None 

661 if len(recs) == 1: 

662 errmsg: list[str] = [] 

663 for k, v in values.items(): 

664 if (recval := getattr(recs[0], k)) != v: 

665 errmsg.append(f"{k}({recval} != {v})") 

666 if errmsg: 

667 raise ValueError( 

668 f"Dimension {dimensionName} in dataId has explicit value" 

669 " inconsistent with records: " + ", ".join(errmsg) 

670 ) 

671 else: 

672 # Multiple matches for an explicit dimension 

673 # should never happen but let downstream complain. 

674 pass 

675 continue 

676 

677 # Build up a WHERE expression 

678 bind = dict(values.items()) 

679 where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) 

680 

681 # Hopefully we get a single record that matches 

682 records = set( 

683 self._registry.queryDimensionRecords( 

684 dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs 

685 ) 

686 ) 

687 

688 if len(records) != 1: 

689 if len(records) > 1: 

690 # visit can have an ambiguous answer without involving 

691 # visit_system. The default visit_system is defined 

692 # by the instrument. 

693 if ( 

694 dimensionName == "visit" 

695 and "visit_system_membership" in self.dimensions 

696 and "visit_system" in self.dimensions["instrument"].metadata 

697 ): 

698 instrument_records = list( 

699 self._registry.queryDimensionRecords( 

700 "instrument", 

701 dataId=newDataId, 

702 **kwargs, 

703 ) 

704 ) 

705 if len(instrument_records) == 1: 

706 visit_system = instrument_records[0].visit_system 

707 if visit_system is None: 

708 # Set to a value that will never match. 

709 visit_system = -1 

710 

711 # Look up each visit in the 

712 # visit_system_membership records. 

713 for rec in records: 

714 membership = list( 

715 self._registry.queryDimensionRecords( 

716 # Use bind to allow zero results. 

717 # This is a fully-specified query. 

718 "visit_system_membership", 

719 where="instrument = inst AND visit_system = system AND visit = v", 

720 bind=dict( 

721 inst=instrument_records[0].name, system=visit_system, v=rec.id 

722 ), 

723 ) 

724 ) 

725 if membership: 

726 # This record is the right answer. 

727 records = {rec} 

728 break 

729 

730 # The ambiguity may have been resolved so check again. 

731 if len(records) > 1: 

732 _LOG.debug( 

733 "Received %d records from constraints of %s", len(records), str(values) 

734 ) 

735 for r in records: 

736 _LOG.debug("- %s", str(r)) 

737 raise ValueError( 

738 f"DataId specification for dimension {dimensionName} is not" 

739 f" uniquely constrained to a single dataset by {values}." 

740 f" Got {len(records)} results." 

741 ) 

742 else: 

743 raise ValueError( 

744 f"DataId specification for dimension {dimensionName} matched no" 

745 f" records when constrained by {values}" 

746 ) 

747 

748 # Get the primary key from the real dimension object 

749 dimension = self.dimensions.getStaticDimensions()[dimensionName] 

750 if not isinstance(dimension, Dimension): 

751 raise RuntimeError( 

752 f"{dimension.name} is not a true dimension, and cannot be used in data IDs." 

753 ) 

754 newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) 

755 

756 return newDataId, kwargs 

757 
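# Editor-added sketch of the record-style data IDs handled above, as seen from a
# user-facing call such as Butler.get(). The "raw" dataset type, record values,
# and detector name are hypothetical: day_obs/seq_num stand in for the integer
# exposure ID, and a string detector value is treated as its full name.
raw = butler.get(
    "raw",
    {"exposure.day_obs": 20231101, "exposure.seq_num": 42},  # record keys, not the exposure ID
    instrument="LATISS",
    detector="RXX_S00",  # alternate (string) key converted to the integer detector ID
)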

758 def _findDatasetRef( 

759 self, 

760 datasetRefOrType: DatasetRef | DatasetType | str, 

761 dataId: DataId | None = None, 

762 *, 

763 collections: Any = None, 

764 predict: bool = False, 

765 run: str | None = None, 

766 datastore_records: bool = False, 

767 **kwargs: Any, 

768 ) -> DatasetRef: 

769 """Shared logic for methods that start with a search for a dataset in 

770 the registry. 

771 

772 Parameters 

773 ---------- 

774 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

775 When `DatasetRef` the `dataId` should be `None`. 

776 Otherwise the `DatasetType` or name thereof. 

777 dataId : `dict` or `DataCoordinate`, optional 

778 A `dict` of `Dimension` link name, value pairs that label the 

779 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

780 should be provided as the first argument. 

781 collections : Any, optional 

782 Collections to be searched, overriding ``self.collections``. 

783 Can be any of the types supported by the ``collections`` argument 

784 to butler construction. 

785 predict : `bool`, optional 

786 If `True`, return a newly created `DatasetRef` with a unique 

787 dataset ID if finding a reference in the `Registry` fails. 

788 Defaults to `False`. 

789 run : `str`, optional 

790 Run collection name to use for creating `DatasetRef` for predicted 

791 datasets. Only used if ``predict`` is `True`. 

792 datastore_records : `bool`, optional 

793 If `True` add datastore records to returned `DatasetRef`. 

794 **kwargs 

795 Additional keyword arguments used to augment or construct a 

796 `DataId`. See `DataId` parameters. 

797 

798 Returns 

799 ------- 

800 ref : `DatasetRef` 

801 A reference to the dataset identified by the given arguments. 

802 This can be the same dataset reference as given if it was 

803 resolved. 

804 

805 Raises 

806 ------ 

807 LookupError 

808 Raised if no matching dataset exists in the `Registry` (and 

809 ``predict`` is `False`). 

810 ValueError 

811 Raised if a resolved `DatasetRef` was passed as an input, but it 

812 differs from the one found in the registry. 

813 TypeError 

814 Raised if no collections were provided. 

815 """ 

816 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) 

817 if isinstance(datasetRefOrType, DatasetRef): 

818 if collections is not None: 

819 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) 

820 # May need to retrieve datastore records if requested. 

821 if datastore_records and datasetRefOrType._datastore_records is None: 

822 datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) 

823 return datasetRefOrType 

824 timespan: Timespan | None = None 

825 

826 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

827 

828 if datasetType.isCalibration(): 

829 # Because this is a calibration dataset, first try to 

830 # standardize the data ID without restricting the dimensions to 

831 # those of the dataset type requested, because there may be extra 

832 # dimensions that provide temporal information for a validity-range 

833 # lookup. 

834 dataId = DataCoordinate.standardize( 

835 dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

836 ) 

837 if dataId.graph.temporal: 

838 dataId = self._registry.expandDataId(dataId) 

839 timespan = dataId.timespan 

840 else: 

841 # Standardize the data ID to just the dimensions of the dataset 

842 # type instead of letting registry.findDataset do it, so we get the 

843 # result even if no dataset is found. 

844 dataId = DataCoordinate.standardize( 

845 dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs 

846 ) 

847 # Always look up the DatasetRef, even if one is given, to ensure it is 

848 # present in the current collection. 

849 ref = self.find_dataset( 

850 datasetType, 

851 dataId, 

852 collections=collections, 

853 timespan=timespan, 

854 datastore_records=datastore_records, 

855 ) 

856 if ref is None: 

857 if predict: 

858 if run is None: 

859 run = self.run 

860 if run is None: 

861 raise TypeError("Cannot predict dataset ID/location with run=None.") 

862 return DatasetRef(datasetType, dataId, run=run) 

863 else: 

864 if collections is None: 

865 collections = self._registry.defaults.collections 

866 raise LookupError( 

867 f"Dataset {datasetType.name} with data ID {dataId} " 

868 f"could not be found in collections {collections}." 

869 ) 

870 if datasetType != ref.datasetType: 

871 # If they differ it is because the user explicitly specified 

872 # a compatible dataset type to this call rather than using the 

873 # registry definition. The DatasetRef must therefore be recreated 

874 # using the user definition such that the expected type is 

875 # returned. 

876 ref = DatasetRef( 

877 datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records 

878 ) 

879 

880 return ref 

881 

882 # TODO: remove on DM-40067. 

883 @transactional 

884 @deprecated( 

885 reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." 

886 " Please use Butler.put(). Be aware that you may need to adjust your usage if you" 

887 " were relying on the run parameter to determine the run." 

888 " Will be removed after v26.0.", 

889 version="v26.0", 

890 category=FutureWarning, 

891 ) 

892 def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: 

893 # Docstring inherited. 

894 return self.put(obj, ref) 

895 

896 @transactional 

897 def put( 

898 self, 

899 obj: Any, 

900 datasetRefOrType: DatasetRef | DatasetType | str, 

901 /, 

902 dataId: DataId | None = None, 

903 *, 

904 run: str | None = None, 

905 **kwargs: Any, 

906 ) -> DatasetRef: 

907 """Store and register a dataset. 

908 

909 Parameters 

910 ---------- 

911 obj : `object` 

912 The dataset. 

913 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

914 When `DatasetRef` is provided, ``dataId`` should be `None`. 

915 Otherwise the `DatasetType` or name thereof. If a fully resolved 

916 `DatasetRef` is given the run and ID are used directly. 

917 dataId : `dict` or `DataCoordinate` 

918 A `dict` of `Dimension` link name, value pairs that label the 

919 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

920 should be provided as the second argument. 

921 run : `str`, optional 

922 The name of the run the dataset should be added to, overriding 

923 ``self.run``. Not used if a resolved `DatasetRef` is provided. 

924 **kwargs 

925 Additional keyword arguments used to augment or construct a 

926 `DataCoordinate`. See `DataCoordinate.standardize` 

927 parameters. Not used if a resolved `DatasetRef` is provided. 

928 

929 Returns 

930 ------- 

931 ref : `DatasetRef` 

932 A reference to the stored dataset, updated with the correct id if 

933 given. 

934 

935 Raises 

936 ------ 

937 TypeError 

938 Raised if the butler is read-only or if no run has been provided. 

939 """ 

940 if isinstance(datasetRefOrType, DatasetRef): 

941 # This is a direct put of predefined DatasetRef. 

942 _LOG.debug("Butler put direct: %s", datasetRefOrType) 

943 if run is not None: 

944 warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) 

945 # If registry already has a dataset with the same dataset ID, 

946 # dataset type and DataId, then _importDatasets will do nothing and 

947 # just return the original ref. We have to raise in this case; the 

948 # datastore check below handles that. 

949 self._registry._importDatasets([datasetRefOrType], expand=True) 

950 # Before trying to write to the datastore check that it does not 

951 # know this dataset. This is prone to races, of course. 

952 if self._datastore.knows(datasetRefOrType): 

953 raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") 

954 # Try to write dataset to the datastore, if it fails due to a race 

955 # with another write, the content of stored data may be 

956 # unpredictable. 

957 try: 

958 self._datastore.put(obj, datasetRefOrType) 

959 except IntegrityError as e: 

960 raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e 

961 return datasetRefOrType 

962 

963 _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) 

964 if not self.isWriteable(): 

965 raise TypeError("Butler is read-only.") 

966 datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) 

967 

968 # Handle dimension records in dataId 

969 dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) 

970 

971 # Add Registry Dataset entry. 

972 dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) 

973 (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) 

974 self._datastore.put(obj, ref) 

975 

976 return ref 

977 
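# Editor-added put() sketch. The "sourceCatalog" dataset type, its dimensions,
# the in-memory ``catalog`` object, and the run name are hypothetical; the
# returned DatasetRef carries the dataset ID assigned by the registry.
ref = butler.put(
    catalog,
    "sourceCatalog",
    instrument="LATISS",
    visit=1234,
    run="u/someone/scratch",  # overrides self.run for this call
)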

978 # TODO: remove on DM-40067. 

979 @deprecated( 

980 reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." 

981 " Please use Butler.get(). Will be removed after v26.0.", 

982 version="v26.0", 

983 category=FutureWarning, 

984 ) 

985 def getDirect( 

986 self, 

987 ref: DatasetRef, 

988 *, 

989 parameters: dict[str, Any] | None = None, 

990 storageClass: StorageClass | str | None = None, 

991 ) -> Any: 

992 """Retrieve a stored dataset. 

993 

994 Parameters 

995 ---------- 

996 ref : `DatasetRef` 

997 Resolved reference to an already stored dataset. 

998 parameters : `dict` 

999 Additional StorageClass-defined options to control reading, 

1000 typically used to efficiently read only a subset of the dataset. 

1001 storageClass : `StorageClass` or `str`, optional 

1002 The storage class to be used to override the Python type 

1003 returned by this method. By default the returned type matches 

1004 the dataset type definition for this dataset. Specifying a 

1005 read `StorageClass` can force a different type to be returned. 

1006 This type must be compatible with the original type. 

1007 

1008 Returns 

1009 ------- 

1010 obj : `object` 

1011 The dataset. 

1012 """ 

1013 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1014 

1015 # TODO: remove on DM-40067. 

1016 @deprecated( 

1017 reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " 

1018 "Please use Butler.getDeferred(). Will be removed after v26.0.", 

1019 version="v26.0", 

1020 category=FutureWarning, 

1021 ) 

1022 def getDirectDeferred( 

1023 self, 

1024 ref: DatasetRef, 

1025 *, 

1026 parameters: dict[str, Any] | None = None, 

1027 storageClass: str | StorageClass | None = None, 

1028 ) -> DeferredDatasetHandle: 

1029 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1030 from a resolved `DatasetRef`. 

1031 

1032 Parameters 

1033 ---------- 

1034 ref : `DatasetRef` 

1035 Resolved reference to an already stored dataset. 

1036 parameters : `dict` 

1037 Additional StorageClass-defined options to control reading, 

1038 typically used to efficiently read only a subset of the dataset. 

1039 storageClass : `StorageClass` or `str`, optional 

1040 The storage class to be used to override the Python type 

1041 returned by this method. By default the returned type matches 

1042 the dataset type definition for this dataset. Specifying a 

1043 read `StorageClass` can force a different type to be returned. 

1044 This type must be compatible with the original type. 

1045 

1046 Returns 

1047 ------- 

1048 obj : `DeferredDatasetHandle` 

1049 A handle which can be used to retrieve a dataset at a later time. 

1050 

1051 Raises 

1052 ------ 

1053 LookupError 

1054 Raised if no matching dataset exists in the `Registry`. 

1055 """ 

1056 # Check that dataset is known to the datastore. 

1057 if not self._datastore.knows(ref): 

1058 raise LookupError(f"Dataset reference {ref} is not known to datastore.") 

1059 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1060 

1061 def getDeferred( 

1062 self, 

1063 datasetRefOrType: DatasetRef | DatasetType | str, 

1064 /, 

1065 dataId: DataId | None = None, 

1066 *, 

1067 parameters: dict | None = None, 

1068 collections: Any = None, 

1069 storageClass: str | StorageClass | None = None, 

1070 **kwargs: Any, 

1071 ) -> DeferredDatasetHandle: 

1072 """Create a `DeferredDatasetHandle` which can later retrieve a dataset, 

1073 after an immediate registry lookup. 

1074 

1075 Parameters 

1076 ---------- 

1077 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1078 When `DatasetRef` the `dataId` should be `None`. 

1079 Otherwise the `DatasetType` or name thereof. 

1080 dataId : `dict` or `DataCoordinate`, optional 

1081 A `dict` of `Dimension` link name, value pairs that label the 

1082 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1083 should be provided as the first argument. 

1084 parameters : `dict` 

1085 Additional StorageClass-defined options to control reading, 

1086 typically used to efficiently read only a subset of the dataset. 

1087 collections : Any, optional 

1088 Collections to be searched, overriding ``self.collections``. 

1089 Can be any of the types supported by the ``collections`` argument 

1090 to butler construction. 

1091 storageClass : `StorageClass` or `str`, optional 

1092 The storage class to be used to override the Python type 

1093 returned by this method. By default the returned type matches 

1094 the dataset type definition for this dataset. Specifying a 

1095 read `StorageClass` can force a different type to be returned. 

1096 This type must be compatible with the original type. 

1097 **kwargs 

1098 Additional keyword arguments used to augment or construct a 

1099 `DataId`. See `DataId` parameters. 

1100 

1101 Returns 

1102 ------- 

1103 obj : `DeferredDatasetHandle` 

1104 A handle which can be used to retrieve a dataset at a later time. 

1105 

1106 Raises 

1107 ------ 

1108 LookupError 

1109 Raised if no matching dataset exists in the `Registry` or 

1110 datastore. 

1111 ValueError 

1112 Raised if a resolved `DatasetRef` was passed as an input, but it 

1113 differs from the one found in the registry. 

1114 TypeError 

1115 Raised if no collections were provided. 

1116 """ 

1117 if isinstance(datasetRefOrType, DatasetRef): 

1118 # Do the quick check first and if that fails, check for artifact 

1119 # existence. This is necessary for datastores that are configured 

1120 # in trust mode where there won't be a record but there will be 

1121 # a file. 

1122 if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): 

1123 ref = datasetRefOrType 

1124 else: 

1125 raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") 

1126 else: 

1127 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1128 return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) 

1129 
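# Editor-added getDeferred() sketch: the registry lookup happens immediately,
# but the datastore read is deferred until the handle's get() is called. The
# "deepCoadd" dataset type and its data ID values are hypothetical.
handle = butler.getDeferred("deepCoadd", tract=9813, patch=42, band="r", skymap="hsc_rings_v1")
coadd = handle.get()  # artifact is read here, not at lookup time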

1130 def get( 

1131 self, 

1132 datasetRefOrType: DatasetRef | DatasetType | str, 

1133 /, 

1134 dataId: DataId | None = None, 

1135 *, 

1136 parameters: dict[str, Any] | None = None, 

1137 collections: Any = None, 

1138 storageClass: StorageClass | str | None = None, 

1139 **kwargs: Any, 

1140 ) -> Any: 

1141 """Retrieve a stored dataset. 

1142 

1143 Parameters 

1144 ---------- 

1145 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1146 When `DatasetRef` the `dataId` should be `None`. 

1147 Otherwise the `DatasetType` or name thereof. 

1148 If a resolved `DatasetRef`, the associated dataset 

1149 is returned directly without additional querying. 

1150 dataId : `dict` or `DataCoordinate` 

1151 A `dict` of `Dimension` link name, value pairs that label the 

1152 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1153 should be provided as the first argument. 

1154 parameters : `dict` 

1155 Additional StorageClass-defined options to control reading, 

1156 typically used to efficiently read only a subset of the dataset. 

1157 collections : Any, optional 

1158 Collections to be searched, overriding ``self.collections``. 

1159 Can be any of the types supported by the ``collections`` argument 

1160 to butler construction. 

1161 storageClass : `StorageClass` or `str`, optional 

1162 The storage class to be used to override the Python type 

1163 returned by this method. By default the returned type matches 

1164 the dataset type definition for this dataset. Specifying a 

1165 read `StorageClass` can force a different type to be returned. 

1166 This type must be compatible with the original type. 

1167 **kwargs 

1168 Additional keyword arguments used to augment or construct a 

1169 `DataCoordinate`. See `DataCoordinate.standardize` 

1170 parameters. 

1171 

1172 Returns 

1173 ------- 

1174 obj : `object` 

1175 The dataset. 

1176 

1177 Raises 

1178 ------ 

1179 LookupError 

1180 Raised if no matching dataset exists in the `Registry`. 

1181 TypeError 

1182 Raised if no collections were provided. 

1183 

1184 Notes 

1185 ----- 

1186 When looking up datasets in a `~CollectionType.CALIBRATION` collection, 

1187 this method requires that the given data ID include temporal dimensions 

1188 beyond the dimensions of the dataset type itself, in order to find the 

1189 dataset with the appropriate validity range. For example, a "bias" 

1190 dataset with native dimensions ``{instrument, detector}`` could be 

1191 fetched with a ``{instrument, detector, exposure}`` data ID, because 

1192 ``exposure`` is a temporal dimension. 

1193 """ 

1194 _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) 

1195 ref = self._findDatasetRef( 

1196 datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs 

1197 ) 

1198 return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) 

1199 
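# Editor-added get() sketch mirroring the Notes above: a "bias" with native
# dimensions {instrument, detector} is found in a CALIBRATION collection by
# adding the temporal ``exposure`` dimension. All values are hypothetical.
bias = butler.get(
    "bias",
    instrument="LATISS",
    detector=0,
    exposure=2023110100042,      # temporal dimension used for the validity-range lookup
    collections="LATISS/calib",  # hypothetical calibration collection
)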

1200 def getURIs( 

1201 self, 

1202 datasetRefOrType: DatasetRef | DatasetType | str, 

1203 /, 

1204 dataId: DataId | None = None, 

1205 *, 

1206 predict: bool = False, 

1207 collections: Any = None, 

1208 run: str | None = None, 

1209 **kwargs: Any, 

1210 ) -> DatasetRefURIs: 

1211 """Return the URIs associated with the dataset. 

1212 

1213 Parameters 

1214 ---------- 

1215 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1216 When `DatasetRef` the `dataId` should be `None`. 

1217 Otherwise the `DatasetType` or name thereof. 

1218 dataId : `dict` or `DataCoordinate` 

1219 A `dict` of `Dimension` link name, value pairs that label the 

1220 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1221 should be provided as the first argument. 

1222 predict : `bool` 

1223 If `True`, allow URIs to be returned of datasets that have not 

1224 been written. 

1225 collections : Any, optional 

1226 Collections to be searched, overriding ``self.collections``. 

1227 Can be any of the types supported by the ``collections`` argument 

1228 to butler construction. 

1229 run : `str`, optional 

1230 Run to use for predictions, overriding ``self.run``. 

1231 **kwargs 

1232 Additional keyword arguments used to augment or construct a 

1233 `DataCoordinate`. See `DataCoordinate.standardize` 

1234 parameters. 

1235 

1236 Returns 

1237 ------- 

1238 uris : `DatasetRefURIs` 

1239 The URI to the primary artifact associated with this dataset (if 

1240 the dataset was disassembled within the datastore this may be 

1241 `None`), and the URIs to any components associated with the dataset 

1242 artifact (this can be empty if there are no components). 

1243 """ 

1244 ref = self._findDatasetRef( 

1245 datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs 

1246 ) 

1247 return self._datastore.getURIs(ref, predict) 

1248 

1249 def getURI( 

1250 self, 

1251 datasetRefOrType: DatasetRef | DatasetType | str, 

1252 /, 

1253 dataId: DataId | None = None, 

1254 *, 

1255 predict: bool = False, 

1256 collections: Any = None, 

1257 run: str | None = None, 

1258 **kwargs: Any, 

1259 ) -> ResourcePath: 

1260 """Return the URI to the Dataset. 

1261 

1262 Parameters 

1263 ---------- 

1264 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1265 When `DatasetRef` the `dataId` should be `None`. 

1266 Otherwise the `DatasetType` or name thereof. 

1267 dataId : `dict` or `DataCoordinate` 

1268 A `dict` of `Dimension` link name, value pairs that label the 

1269 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1270 should be provided as the first argument. 

1271 predict : `bool` 

1272 If `True`, allow URIs to be returned of datasets that have not 

1273 been written. 

1274 collections : Any, optional 

1275 Collections to be searched, overriding ``self.collections``. 

1276 Can be any of the types supported by the ``collections`` argument 

1277 to butler construction. 

1278 run : `str`, optional 

1279 Run to use for predictions, overriding ``self.run``. 

1280 **kwargs 

1281 Additional keyword arguments used to augment or construct a 

1282 `DataCoordinate`. See `DataCoordinate.standardize` 

1283 parameters. 

1284 

1285 Returns 

1286 ------- 

1287 uri : `lsst.resources.ResourcePath` 

1288 URI pointing to the Dataset within the datastore. If the 

1289 Dataset does not exist in the datastore, and if ``predict`` is 

1290 `True`, the URI will be a prediction and will include a URI 

1291 fragment "#predicted". 

1292 If the datastore does not have entities that relate well 

1293 to the concept of a URI the returned URI string will be 

1294 descriptive. The returned URI is not guaranteed to be obtainable. 

1295 

1296 Raises 

1297 ------ 

1298 LookupError 

1299 A URI has been requested for a dataset that does not exist and 

1300 guessing is not allowed. 

1301 ValueError 

1302 Raised if a resolved `DatasetRef` was passed as an input, but it 

1303 differs from the one found in the registry. 

1304 TypeError 

1305 Raised if no collections were provided. 

1306 RuntimeError 

1307 Raised if a URI is requested for a dataset that consists of 

1308 multiple artifacts. 

1309 """ 

1310 primary, components = self.getURIs( 

1311 datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs 

1312 ) 

1313 

1314 if primary is None or components: 

1315 raise RuntimeError( 

1316 f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " 

1317 "Use Butler.getURIs() instead." 

1318 ) 

1319 return primary 

1320 
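# Editor-added getURI() sketch; the dataset type and data ID are hypothetical.
# With predict=True a not-yet-written dataset yields a URI carrying a
# "#predicted" fragment; datasets with multiple component artifacts need getURIs().
uri = butler.getURI("calexp", instrument="LATISS", visit=1234, detector=0)
print(uri)  # an lsst.resources.ResourcePath pointing at the artifact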

1321 def get_dataset_type(self, name: str) -> DatasetType: 

1322 return self._registry.getDatasetType(name) 

1323 

1324 def get_dataset( 

1325 self, 

1326 id: DatasetId, 

1327 storage_class: str | StorageClass | None = None, 

1328 dimension_records: bool = False, 

1329 datastore_records: bool = False, 

1330 ) -> DatasetRef | None: 

1331 ref = self._registry.getDataset(id) 

1332 if ref is not None: 

1333 if dimension_records: 

1334 ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions)) 

1335 if storage_class: 

1336 ref = ref.overrideStorageClass(storage_class) 

1337 if datastore_records: 

1338 ref = self._registry.get_datastore_records(ref) 

1339 return ref 

1340 

1341 def find_dataset( 

1342 self, 

1343 dataset_type: DatasetType | str, 

1344 data_id: DataId | None = None, 

1345 *, 

1346 collections: str | Sequence[str] | None = None, 

1347 timespan: Timespan | None = None, 

1348 storage_class: str | StorageClass | None = None, 

1349 dimension_records: bool = False, 

1350 datastore_records: bool = False, 

1351 **kwargs: Any, 

1352 ) -> DatasetRef | None: 

1353 # Handle any parts of the dataID that are not using primary dimension 

1354 # keys. 

1355 if isinstance(dataset_type, str): 

1356 actual_type = self.get_dataset_type(dataset_type) 

1357 else: 

1358 actual_type = dataset_type 

1359 data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs) 

1360 

1361 ref = self._registry.findDataset( 

1362 dataset_type, 

1363 data_id, 

1364 collections=collections, 

1365 timespan=timespan, 

1366 datastore_records=datastore_records, 

1367 **kwargs, 

1368 ) 

1369 if ref is not None and dimension_records: 

1370 ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions)) 

1371 if ref is not None and storage_class is not None: 

1372 ref = ref.overrideStorageClass(storage_class) 

1373 return ref 

1374 
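# Editor-added find_dataset() sketch; names and values are hypothetical. The
# call resolves a DatasetRef without reading the artifact and returns None when
# nothing matches in the searched collections.
ref = butler.find_dataset(
    "calexp",
    instrument="LATISS",
    visit=1234,
    detector=0,
    collections=["LATISS/runs/nightly"],  # hypothetical collections to search
    dimension_records=True,               # attach expanded dimension records to the ref
)
if ref is not None:
    print(ref.run, ref.id)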

1375 def retrieveArtifacts( 

1376 self, 

1377 refs: Iterable[DatasetRef], 

1378 destination: ResourcePathExpression, 

1379 transfer: str = "auto", 

1380 preserve_path: bool = True, 

1381 overwrite: bool = False, 

1382 ) -> list[ResourcePath]: 

1383 # Docstring inherited. 

1384 return self._datastore.retrieveArtifacts( 

1385 refs, 

1386 ResourcePath(destination), 

1387 transfer=transfer, 

1388 preserve_path=preserve_path, 

1389 overwrite=overwrite, 

1390 ) 

1391 

1392 def exists( 

1393 self, 

1394 dataset_ref_or_type: DatasetRef | DatasetType | str, 

1395 /, 

1396 data_id: DataId | None = None, 

1397 *, 

1398 full_check: bool = True, 

1399 collections: Any = None, 

1400 **kwargs: Any, 

1401 ) -> DatasetExistence: 

1402 # Docstring inherited. 

1403 existence = DatasetExistence.UNRECOGNIZED 

1404 

1405 if isinstance(dataset_ref_or_type, DatasetRef): 

1406 if collections is not None: 

1407 warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) 

1408 if data_id is not None: 

1409 warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) 

1410 ref = dataset_ref_or_type 

1411 registry_ref = self._registry.getDataset(dataset_ref_or_type.id) 

1412 if registry_ref is not None: 

1413 existence |= DatasetExistence.RECORDED 

1414 

1415 if dataset_ref_or_type != registry_ref: 

1416 # This could mean that storage classes differ, so we should 

1417 # check for that but use the registry ref for the rest of 

1418 # the method. 

1419 if registry_ref.is_compatible_with(dataset_ref_or_type): 

1420 # Use the registry version from now on. 

1421 ref = registry_ref 

1422 else: 

1423 raise ValueError( 

1424 f"The ref given to exists() ({ref}) has the same dataset ID as one " 

1425 f"in registry but has different, incompatible values ({registry_ref})." 

1426 ) 

1427 else: 

1428 try: 

1429 ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) 

1430 except (LookupError, TypeError, NoDefaultCollectionError): 

1431 return existence 

1432 existence |= DatasetExistence.RECORDED 

1433 

1434 if self._datastore.knows(ref): 

1435 existence |= DatasetExistence.DATASTORE 

1436 

1437 if full_check: 

1438 if self._datastore.exists(ref): 

1439 existence |= DatasetExistence._ARTIFACT 

1440 elif existence.value != DatasetExistence.UNRECOGNIZED.value: 

1441 # Do not add this flag if we have no other idea about a dataset. 

1442 existence |= DatasetExistence(DatasetExistence._ASSUMED) 

1443 

1444 return existence 

1445 
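# --- Editorial sketch, not part of the measured source ---
# Hypothetical interpretation of the DatasetExistence flags combined above,
# assuming DatasetExistence is exported at the package level as the relative
# import in this module suggests. Repo path, collection, and data ID values
# are illustrative assumptions.
from lsst.daf.butler import Butler, DatasetExistence

butler = Butler("/repo", collections="HSC/runs/RC2")
existence = butler.exists("calexp", instrument="HSC", visit=903334, detector=42)
if existence & DatasetExistence.RECORDED and not existence & DatasetExistence.DATASTORE:
    print("Known to registry but not to the datastore.")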

1446 def _exists_many( 

1447 self, 

1448 refs: Iterable[DatasetRef], 

1449 /, 

1450 *, 

1451 full_check: bool = True, 

1452 ) -> dict[DatasetRef, DatasetExistence]: 

1453 # Docstring inherited. 

1454 existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} 

1455 

1456 # Registry does not have a bulk API to check for a ref. 

1457 for ref in refs: 

1458 registry_ref = self._registry.getDataset(ref.id) 

1459 if registry_ref is not None: 

1460 # It is possible, albeit unlikely, that the given ref does 

1461 # not match the one in registry even though the UUID matches. 

1462 # When checking a single ref we raise, but it's impolite to 

1463 # do that when potentially hundreds of refs are being checked. 

1464 # We could change the API to accept only UUIDs, which would 

1465 # remove both the ability to check and the worry about 

1466 # differing storage classes. Given the ongoing discussion 

1467 # on refs vs UUIDs and whether to raise or add a new 

1468 # private flag, treat this as a private API for now. 

1469 existence[ref] |= DatasetExistence.RECORDED 

1470 

1471 # Ask datastore if it knows about these refs. 

1472 knows = self._datastore.knows_these(refs) 

1473 for ref, known in knows.items(): 

1474 if known: 

1475 existence[ref] |= DatasetExistence.DATASTORE 

1476 

1477 if full_check: 

1478 mexists = self._datastore.mexists(refs) 

1479 for ref, exists in mexists.items(): 

1480 if exists: 

1481 existence[ref] |= DatasetExistence._ARTIFACT 

1482 else: 

1483 # Do not set this flag if nothing is known about the dataset. 

1484 for ref in existence: 

1485 if existence[ref] != DatasetExistence.UNRECOGNIZED: 

1486 existence[ref] |= DatasetExistence._ASSUMED 

1487 

1488 return existence 

1489 

1490 # TODO: remove on DM-40079. 

1491 @deprecated( 

1492 reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", 

1493 version="v26.0", 

1494 category=FutureWarning, 

1495 ) 

1496 def datasetExists( 

1497 self, 

1498 datasetRefOrType: DatasetRef | DatasetType | str, 

1499 dataId: DataId | None = None, 

1500 *, 

1501 collections: Any = None, 

1502 **kwargs: Any, 

1503 ) -> bool: 

1504 """Return True if the Dataset is actually present in the Datastore. 

1505 

1506 Parameters 

1507 ---------- 

1508 datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

1509 When `DatasetRef`, the `dataId` should be `None`. 

1510 Otherwise the `DatasetType` or name thereof. 

1511 dataId : `dict` or `DataCoordinate` 

1512 A `dict` of `Dimension` link name, value pairs that label the 

1513 `DatasetRef` within a Collection. When `None`, a `DatasetRef` 

1514 should be provided as the first argument. 

1515 collections : Any, optional 

1516 Collections to be searched, overriding ``self.collections``. 

1517 Can be any of the types supported by the ``collections`` argument 

1518 to butler construction. 

1519 **kwargs 

1520 Additional keyword arguments used to augment or construct a 

1521 `DataCoordinate`. See `DataCoordinate.standardize` 

1522 parameters. 

1523 

1524 Raises 

1525 ------ 

1526 LookupError 

1527 Raised if the dataset is not even present in the Registry. 

1528 ValueError 

1529 Raised if a resolved `DatasetRef` was passed as an input, but it 

1530 differs from the one found in the registry. 

1531 NoDefaultCollectionError 

1532 Raised if no collections were provided. 

1533 """ 

1534 # A resolved ref may be given that is not known to this butler. 

1535 if isinstance(datasetRefOrType, DatasetRef): 

1536 ref = self._registry.getDataset(datasetRefOrType.id) 

1537 if ref is None: 

1538 raise LookupError( 

1539 f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." 

1540 ) 

1541 else: 

1542 ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) 

1543 return self._datastore.exists(ref) 

1544 
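# --- Editorial sketch, not part of the measured source ---
# The replacement suggested by the deprecation message above: exists()
# reports datastore presence via the DatasetExistence.DATASTORE flag.
# Repo path, collection, and data ID values are illustrative assumptions.
from lsst.daf.butler import Butler, DatasetExistence

butler = Butler("/repo")
existence = butler.exists(
    "calexp",
    dict(instrument="HSC", visit=903334, detector=42),
    collections="HSC/runs/RC2",
)
present = bool(existence & DatasetExistence.DATASTORE)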

1545 def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: 

1546 # Docstring inherited. 

1547 if not self.isWriteable(): 

1548 raise TypeError("Butler is read-only.") 

1549 names = list(names) 

1550 refs: list[DatasetRef] = [] 

1551 for name in names: 

1552 collectionType = self._registry.getCollectionType(name) 

1553 if collectionType is not CollectionType.RUN: 

1554 raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") 

1555 refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) 

1556 with self._datastore.transaction(), self._registry.transaction(): 

1557 if unstore: 

1558 self._datastore.trash(refs) 

1559 else: 

1560 self._datastore.forget(refs) 

1561 for name in names: 

1562 self._registry.removeCollection(name) 

1563 if unstore: 

1564 # Point of no return for removing artifacts 

1565 self._datastore.emptyTrash() 

1566 
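# --- Editorial sketch, not part of the measured source ---
# Hypothetical removal of two RUN collections, deleting their artifacts as
# well (unstore=True). Repo path and run names are illustrative assumptions.
from lsst.daf.butler import Butler

butler = Butler("/repo", writeable=True)
butler.removeRuns(["u/example/run1", "u/example/run2"], unstore=True)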

1567 def pruneDatasets( 

1568 self, 

1569 refs: Iterable[DatasetRef], 

1570 *, 

1571 disassociate: bool = True, 

1572 unstore: bool = False, 

1573 tags: Iterable[str] = (), 

1574 purge: bool = False, 

1575 ) -> None: 

1576 # docstring inherited from LimitedButler 

1577 

1578 if not self.isWriteable(): 

1579 raise TypeError("Butler is read-only.") 

1580 if purge: 

1581 if not disassociate: 

1582 raise TypeError("Cannot pass purge=True without disassociate=True.") 

1583 if not unstore: 

1584 raise TypeError("Cannot pass purge=True without unstore=True.") 

1585 elif disassociate: 

1586 tags = tuple(tags) 

1587 if not tags: 

1588 raise TypeError("No tags provided but disassociate=True.") 

1589 for tag in tags: 

1590 collectionType = self._registry.getCollectionType(tag) 

1591 if collectionType is not CollectionType.TAGGED: 

1592 raise TypeError( 

1593 f"Cannot disassociate from collection '{tag}' " 

1594 f"of non-TAGGED type {collectionType.name}." 

1595 ) 

1596 # Transform possibly-single-pass iterable into something we can iterate 

1597 # over multiple times. 

1598 refs = list(refs) 

1599 # Pruning a component of a DatasetRef makes no sense since the registry 

1600 # doesn't know about components and the datastore might not store 

1601 # components in a separate file. 

1602 for ref in refs: 

1603 if ref.datasetType.component(): 

1604 raise ValueError(f"Can not prune a component of a dataset (ref={ref})") 

1605 # We don't need an unreliable Datastore transaction for this, because 

1606 # we've been extra careful to ensure that Datastore.trash only involves 

1607 # mutating the Registry (it can _look_ at Datastore-specific things, 

1608 # but shouldn't change them), and hence all operations here are 

1609 # Registry operations. 

1610 with self._datastore.transaction(), self._registry.transaction(): 

1611 if unstore: 

1612 self._datastore.trash(refs) 

1613 if purge: 

1614 self._registry.removeDatasets(refs) 

1615 elif disassociate: 

1616 assert tags, "Guaranteed by earlier logic in this function." 

1617 for tag in tags: 

1618 self._registry.disassociate(tag, refs) 

1619 # We've exited the Registry transaction, and apparently committed. 

1620 # (If there was an exception, everything rolled back, it's as if 

1621 # nothing happened, and we never get here.) 

1622 # Datastore artifacts are not yet gone, but they're clearly marked 

1623 # as trash, so if we fail to delete now because of (e.g.) filesystem 

1624 # problems we can try again later, and if manual administrative 

1625 # intervention is required, it's pretty clear what that should entail: 

1626 # deleting everything on disk and in private Datastore tables that is 

1627 # in the dataset_location_trash table. 

1628 if unstore: 

1629 # Point of no return for removing artifacts 

1630 self._datastore.emptyTrash() 

1631 
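# --- Editorial sketch, not part of the measured source ---
# Hypothetical full purge of queried datasets; per the argument checks above,
# purge=True requires both disassociate=True and unstore=True. Repo path,
# dataset type, and collection name are illustrative assumptions.
from lsst.daf.butler import Butler

butler = Butler("/repo", writeable=True)
refs = list(butler.registry.queryDatasets("calexp", collections="u/example/scratch"))
butler.pruneDatasets(refs, disassociate=True, unstore=True, purge=True)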

1632 @transactional 

1633 def ingest( 

1634 self, 

1635 *datasets: FileDataset, 

1636 transfer: str | None = "auto", 

1637 run: str | None = None, 

1638 idGenerationMode: DatasetIdGenEnum | None = None, 

1639 record_validation_info: bool = True, 

1640 ) -> None: 

1641 # Docstring inherited. 

1642 if not self.isWriteable(): 

1643 raise TypeError("Butler is read-only.") 

1644 

1645 _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") 

1646 if not datasets: 

1647 return 

1648 

1649 if idGenerationMode is not None: 

1650 warnings.warn( 

1651 "The idGenerationMode parameter is no longer used and is ignored. " 

1652 "Will be removed after v26.0.", 

1653 FutureWarning, 

1654 stacklevel=2, 

1655 ) 

1656 

1657 progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) 

1658 

1659 # We need to reorganize all the inputs so that they are grouped 

1660 # by dataset type and run. Multiple refs in a single FileDataset 

1661 # are required to share the run and dataset type. 

1662 GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] 

1663 groupedData: GroupedData = defaultdict(list) 

1664 

1665 # Track DataIDs that are being ingested so we can spot issues early 

1666 # with duplication. Retain previous FileDataset so we can report it. 

1667 groupedDataIds: MutableMapping[ 

1668 tuple[DatasetType, str], dict[DataCoordinate, FileDataset] 

1669 ] = defaultdict(dict) 

1670 

1671 used_run = False 

1672 

1673 # And the nested loop that populates it: 

1674 for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): 

1675 # Somewhere to store pre-existing refs if we have an 

1676 # execution butler. 

1677 existingRefs: list[DatasetRef] = [] 

1678 

1679 for ref in dataset.refs: 

1680 assert ref.run is not None # For mypy 

1681 group_key = (ref.datasetType, ref.run) 

1682 

1683 if ref.dataId in groupedDataIds[group_key]: 

1684 raise ConflictingDefinitionError( 

1685 f"Ingest conflict. Dataset {dataset.path} has the same" 

1686 " DataId as other ingest dataset" 

1687 f" {groupedDataIds[group_key][ref.dataId].path}" 

1688 f" ({ref.dataId})" 

1689 ) 

1690 

1691 groupedDataIds[group_key][ref.dataId] = dataset 

1692 

1693 if existingRefs: 

1694 if len(dataset.refs) != len(existingRefs): 

1695 # Keeping track of partially pre-existing datasets is hard 

1696 # and should generally never happen. For now don't allow 

1697 # it. 

1698 raise ConflictingDefinitionError( 

1699 f"For dataset {dataset.path} some dataIds already exist" 

1700 " in registry but others do not. This is not supported." 

1701 ) 

1702 

1703 # Store expanded form in the original FileDataset. 

1704 dataset.refs = existingRefs 

1705 else: 

1706 groupedData[group_key].append(dataset) 

1707 

1708 if not used_run and run is not None: 

1709 warnings.warn( 

1710 "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " 

1711 f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", 

1712 category=FutureWarning, 

1713 stacklevel=3, # Take into account the @transactional decorator. 

1714 ) 

1715 

1716 # Now we can bulk-insert into Registry for each DatasetType. 

1717 for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( 

1718 groupedData.items(), desc="Bulk-inserting datasets by type" 

1719 ): 

1720 refs_to_import = [] 

1721 for dataset in grouped_datasets: 

1722 refs_to_import.extend(dataset.refs) 

1723 

1724 n_refs = len(refs_to_import) 

1725 _LOG.verbose( 

1726 "Importing %d ref%s of dataset type %r into run %r", 

1727 n_refs, 

1728 "" if n_refs == 1 else "s", 

1729 datasetType.name, 

1730 this_run, 

1731 ) 

1732 

1733 # Import the refs and expand the DataCoordinates since we can't 

1734 # guarantee that they are expanded and Datastore will need 

1735 # the records. 

1736 imported_refs = self._registry._importDatasets(refs_to_import, expand=True) 

1737 assert set(imported_refs) == set(refs_to_import) 

1738 

1739 # Replace all the refs in the FileDataset with expanded versions. 

1740 # Pull them off in the order we put them on the list. 

1741 for dataset in grouped_datasets: 

1742 n_dataset_refs = len(dataset.refs) 

1743 dataset.refs = imported_refs[:n_dataset_refs] 

1744 del imported_refs[:n_dataset_refs] 

1745 

1746 # Bulk-insert everything into Datastore. 

1747 # We do not know if any of the registry entries already existed 

1748 # (_importDatasets only complains if they exist but differ) so 

1749 # we have to catch IntegrityError explicitly. 

1750 try: 

1751 self._datastore.ingest( 

1752 *datasets, transfer=transfer, record_validation_info=record_validation_info 

1753 ) 

1754 except IntegrityError as e: 

1755 raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e 

1756 
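# --- Editorial sketch, not part of the measured source ---
# Hypothetical ingest of one existing file as a resolved DatasetRef, which is
# what the grouping-by-(dataset type, run) logic above expects. The repo
# path, run name, dataset type, data ID values, and file path are all
# assumptions for illustration only.
from lsst.daf.butler import Butler, DatasetRef, FileDataset

butler = Butler("/repo", writeable=True)
butler.registry.registerRun("HSC/raw/example")
dataset_type = butler.get_dataset_type("raw")
ref = DatasetRef(
    dataset_type,
    butler.registry.expandDataId(instrument="HSC", exposure=903334, detector=42),
    run="HSC/raw/example",
)
butler.ingest(FileDataset(path="/data/raw-903334-42.fits", refs=[ref]), transfer="copy")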

1757 @contextlib.contextmanager 

1758 def export( 

1759 self, 

1760 *, 

1761 directory: str | None = None, 

1762 filename: str | None = None, 

1763 format: str | None = None, 

1764 transfer: str | None = None, 

1765 ) -> Iterator[RepoExportContext]: 

1766 # Docstring inherited. 

1767 if directory is None and transfer is not None: 

1768 raise TypeError("Cannot transfer without providing a directory.") 

1769 if transfer == "move": 

1770 raise TypeError("Transfer may not be 'move': export is read-only") 

1771 if format is None: 

1772 if filename is None: 

1773 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1774 else: 

1775 _, format = os.path.splitext(filename) 

1776 if not format: 

1777 raise ValueError("Please specify a file extension to determine export format.") 

1778 format = format[1:] # Strip leading "." 

1779 elif filename is None: 

1780 filename = f"export.{format}" 

1781 if directory is not None: 

1782 filename = os.path.join(directory, filename) 

1783 formats = self._config["repo_transfer_formats"] 

1784 if format not in formats: 

1785 raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") 

1786 BackendClass = get_class_of(formats[format, "export"]) 

1787 with open(filename, "w") as stream: 

1788 backend = BackendClass(stream, universe=self.dimensions) 

1789 try: 

1790 helper = RepoExportContext( 

1791 self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer 

1792 ) 

1793 yield helper 

1794 except BaseException: 

1795 raise 

1796 else: 

1797 helper._finish() 

1798 
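# --- Editorial sketch, not part of the measured source ---
# Hypothetical export of queried datasets and their files, assuming the
# yielded RepoExportContext provides a saveDatasets() method as documented
# for daf_butler. Repo path, directory, dataset type, and collection are
# illustrative assumptions.
from lsst.daf.butler import Butler

butler = Butler("/repo")
with butler.export(directory="/tmp/export", filename="export.yaml", transfer="copy") as export:
    export.saveDatasets(butler.registry.queryDatasets("calexp", collections="HSC/runs/RC2"))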

1799 def import_( 

1800 self, 

1801 *, 

1802 directory: ResourcePathExpression | None = None, 

1803 filename: ResourcePathExpression | TextIO | None = None, 

1804 format: str | None = None, 

1805 transfer: str | None = None, 

1806 skip_dimensions: set | None = None, 

1807 ) -> None: 

1808 # Docstring inherited. 

1809 if not self.isWriteable(): 

1810 raise TypeError("Butler is read-only.") 

1811 if format is None: 

1812 if filename is None: 

1813 raise TypeError("At least one of 'filename' or 'format' must be provided.") 

1814 else: 

1815 _, format = os.path.splitext(filename) # type: ignore 

1816 elif filename is None: 

1817 filename = ResourcePath(f"export.{format}", forceAbsolute=False) 

1818 if directory is not None: 

1819 directory = ResourcePath(directory, forceDirectory=True) 

1820 # mypy doesn't think this will work but it does in python >= 3.10. 

1821 if isinstance(filename, ResourcePathExpression): # type: ignore 

1822 filename = ResourcePath(filename, forceAbsolute=False) # type: ignore 

1823 if not filename.isabs() and directory is not None: 

1824 potential = directory.join(filename) 

1825 exists_in_cwd = filename.exists() 

1826 exists_in_dir = potential.exists() 

1827 if exists_in_cwd and exists_in_dir: 

1828 _LOG.warning( 

1829 "A relative path for filename was specified (%s) which exists relative to cwd. " 

1830 "Additionally, the file exists relative to the given search directory (%s). " 

1831 "Using the export file in the given directory.", 

1832 filename, 

1833 potential, 

1834 ) 

1835 # Given they specified an explicit directory and that 

1836 # directory has the export file in it, assume that that 

1837 # is what was meant despite the file in cwd. 

1838 filename = potential 

1839 elif exists_in_dir: 

1840 filename = potential 

1841 elif not exists_in_cwd and not exists_in_dir: 

1842 # Raise early. 

1843 raise FileNotFoundError( 

1844 f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." 

1845 ) 

1846 BackendClass: type[RepoImportBackend] = get_class_of( 

1847 self._config["repo_transfer_formats"][format]["import"] 

1848 ) 

1849 

1850 def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: 

1851 backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] 

1852 backend.register() 

1853 with self.transaction(): 

1854 backend.load( 

1855 self._datastore, 

1856 directory=directory, 

1857 transfer=transfer, 

1858 skip_dimensions=skip_dimensions, 

1859 ) 

1860 

1861 if isinstance(filename, ResourcePath): 

1862 # We cannot use open() here at the moment because of 

1863 # DM-38589, since yaml does stream.read(8192) in a loop. 

1864 stream = io.StringIO(filename.read().decode()) 

1865 doImport(stream) 

1866 else: 

1867 doImport(filename) # type: ignore 

1868 
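# --- Editorial sketch, not part of the measured source ---
# Hypothetical import of the export file produced by the sketch above into a
# second repository. Paths are illustrative assumptions.
from lsst.daf.butler import Butler

target = Butler("/other_repo", writeable=True)
target.import_(directory="/tmp/export", filename="export.yaml", transfer="symlink")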

1869 def transfer_from( 

1870 self, 

1871 source_butler: LimitedButler, 

1872 source_refs: Iterable[DatasetRef], 

1873 transfer: str = "auto", 

1874 skip_missing: bool = True, 

1875 register_dataset_types: bool = False, 

1876 transfer_dimensions: bool = False, 

1877 ) -> collections.abc.Collection[DatasetRef]: 

1878 # Docstring inherited. 

1879 if not self.isWriteable(): 

1880 raise TypeError("Butler is read-only.") 

1881 progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) 

1882 

1883 # Will iterate through the refs multiple times so need to convert 

1884 # to a list if this isn't a collection. 

1885 if not isinstance(source_refs, collections.abc.Collection): 

1886 source_refs = list(source_refs) 

1887 

1888 original_count = len(source_refs) 

1889 _LOG.info("Transferring %d datasets into %s", original_count, str(self)) 

1890 

1891 # In some situations the datastore artifact may be missing 

1892 # and we do not want that registry entry to be imported. 

1893 # Asking the datastore is not sufficient because the records may 

1894 # have been purged; we have to ask for the (predicted) URI and check 

1895 # existence explicitly. Execution butler is set up exactly like 

1896 # this with no datastore records. 

1897 artifact_existence: dict[ResourcePath, bool] = {} 

1898 if skip_missing: 

1899 dataset_existence = source_butler._datastore.mexists( 

1900 source_refs, artifact_existence=artifact_existence 

1901 ) 

1902 source_refs = [ref for ref, exists in dataset_existence.items() if exists] 

1903 filtered_count = len(source_refs) 

1904 n_missing = original_count - filtered_count 

1905 _LOG.verbose( 

1906 "%d dataset%s removed because the artifact does not exist. Now have %d.", 

1907 n_missing, 

1908 "" if n_missing == 1 else "s", 

1909 filtered_count, 

1910 ) 

1911 

1912 # Importing requires that we group the refs by dataset type and run 

1913 # before doing the import. 

1914 source_dataset_types = set() 

1915 grouped_refs = defaultdict(list) 

1916 for ref in source_refs: 

1917 grouped_refs[ref.datasetType, ref.run].append(ref) 

1918 source_dataset_types.add(ref.datasetType) 

1919 

1920 # Check whether the dataset types in the source butler have 

1921 # the same definitions in the target butler, and register missing 

1922 # ones if requested. Registration must happen outside a transaction. 

1923 newly_registered_dataset_types = set() 

1924 for datasetType in source_dataset_types: 

1925 if register_dataset_types: 

1926 # Let this raise immediately if inconsistent. Continuing 

1927 # on to find additional inconsistent dataset types 

1928 # might result in additional unwanted dataset types being 

1929 # registered. 

1930 if self._registry.registerDatasetType(datasetType): 

1931 newly_registered_dataset_types.add(datasetType) 

1932 else: 

1933 # If the dataset type is missing, let it fail immediately. 

1934 target_dataset_type = self.get_dataset_type(datasetType.name) 

1935 if target_dataset_type != datasetType: 

1936 raise ConflictingDefinitionError( 

1937 "Source butler dataset type differs from definition" 

1938 f" in target butler: {datasetType} !=" 

1939 f" {target_dataset_type}" 

1940 ) 

1941 if newly_registered_dataset_types: 

1942 # We may have registered some even if there were inconsistencies 

1943 # but should let people know (or else remove them again). 

1944 _LOG.verbose( 

1945 "Registered the following dataset types in the target Butler: %s", 

1946 ", ".join(d.name for d in newly_registered_dataset_types), 

1947 ) 

1948 else: 

1949 _LOG.verbose("All required dataset types are known to the target Butler") 

1950 

1951 dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) 

1952 if transfer_dimensions: 

1953 # Collect all the dimension records for these refs. 

1954 # All dimensions are to be copied but the list of valid dimensions 

1955 # comes from this butler's universe. 

1956 elements = frozenset( 

1957 element 

1958 for element in self.dimensions.getStaticElements() 

1959 if element.hasTable() and element.viewOf is None 

1960 ) 

1961 dataIds = {ref.dataId for ref in source_refs} 

1962 # This logic comes from saveDataIds. 

1963 for dataId in dataIds: 

1964 # Need an expanded record; if it is not expanded we need a full 

1965 # butler with a registry (allow mocks with registry too). 

1966 if not dataId.hasRecords(): 

1967 if registry := getattr(source_butler, "registry", None): 

1968 dataId = registry.expandDataId(dataId) 

1969 else: 

1970 raise TypeError("Input butler needs to be a full butler to expand DataId.") 

1971 # If this butler doesn't know about a dimension in the source 

1972 # butler, things will break later. 

1973 for record in dataId.records.values(): 

1974 if record is not None and record.definition in elements: 

1975 dimension_records[record.definition].setdefault(record.dataId, record) 

1976 

1977 handled_collections: set[str] = set() 

1978 

1979 # Do all the importing in a single transaction. 

1980 with self.transaction(): 

1981 if dimension_records: 

1982 _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") 

1983 for element, r in dimension_records.items(): 

1984 records = [r[dataId] for dataId in r] 

1985 # Assume that if the record is already present that we can 

1986 # use it without having to check that the record metadata 

1987 # is consistent. 

1988 self._registry.insertDimensionData(element, *records, skip_existing=True) 

1989 

1990 n_imported = 0 

1991 for (datasetType, run), refs_to_import in progress.iter_item_chunks( 

1992 grouped_refs.items(), desc="Importing to registry by run and dataset type" 

1993 ): 

1994 if run not in handled_collections: 

1995 # May need to create output collection. If source butler 

1996 # has a registry, ask for documentation string. 

1997 run_doc = None 

1998 if registry := getattr(source_butler, "registry", None): 

1999 run_doc = registry.getCollectionDocumentation(run) 

2000 registered = self._registry.registerRun(run, doc=run_doc) 

2001 handled_collections.add(run) 

2002 if registered: 

2003 _LOG.verbose("Creating output run %s", run) 

2004 

2005 n_refs = len(refs_to_import) 

2006 _LOG.verbose( 

2007 "Importing %d ref%s of dataset type %s into run %s", 

2008 n_refs, 

2009 "" if n_refs == 1 else "s", 

2010 datasetType.name, 

2011 run, 

2012 ) 

2013 

2014 # Assume we are using UUIDs and the source refs will match 

2015 # those imported. 

2016 imported_refs = self._registry._importDatasets(refs_to_import) 

2017 assert set(imported_refs) == set(refs_to_import) 

2018 n_imported += len(imported_refs) 

2019 

2020 assert len(source_refs) == n_imported 

2021 _LOG.verbose("Imported %d datasets into destination butler", n_imported) 

2022 

2023 # Ask the datastore to transfer. The datastore has to check that 

2024 # the source datastore is compatible with the target datastore. 

2025 accepted, rejected = self._datastore.transfer_from( 

2026 source_butler._datastore, 

2027 source_refs, 

2028 transfer=transfer, 

2029 artifact_existence=artifact_existence, 

2030 ) 

2031 if rejected: 

2032 # For now, accept the registry entries but not the files. 

2033 _LOG.warning( 

2034 "%d datasets were rejected and %d accepted for dataset type %s in run %r.", 

2035 len(rejected), 

2036 len(accepted), 

2037 datasetType, 

2038 run, 

2039 ) 

2040 

2041 return source_refs 

2042 
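# --- Editorial sketch, not part of the measured source ---
# Hypothetical direct transfer of datasets between two repositories without
# an intermediate export file. Repo paths, dataset type, and collection are
# illustrative assumptions.
from lsst.daf.butler import Butler

source = Butler("/repo")
target = Butler("/other_repo", writeable=True)
refs = source.registry.queryDatasets("calexp", collections="HSC/runs/RC2")
transferred = target.transfer_from(
    source,
    refs,
    transfer="copy",
    register_dataset_types=True,
    transfer_dimensions=True,
)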

2043 def validateConfiguration( 

2044 self, 

2045 logFailures: bool = False, 

2046 datasetTypeNames: Iterable[str] | None = None, 

2047 ignore: Iterable[str] | None = None, 

2048 ) -> None: 

2049 # Docstring inherited. 

2050 if datasetTypeNames: 

2051 datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] 

2052 else: 

2053 datasetTypes = list(self._registry.queryDatasetTypes()) 

2054 

2055 # filter out anything from the ignore list 

2056 if ignore: 

2057 ignore = set(ignore) 

2058 datasetTypes = [ 

2059 e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore 

2060 ] 

2061 else: 

2062 ignore = set() 

2063 

2064 # For each datasetType that has an instrument dimension, create 

2065 # a DatasetRef for each defined instrument 

2066 datasetRefs = [] 

2067 

2068 # Find all the registered instruments (if "instrument" is in the 

2069 # universe). 

2070 if "instrument" in self.dimensions: 

2071 instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} 

2072 

2073 for datasetType in datasetTypes: 

2074 if "instrument" in datasetType.dimensions: 

2075 # In order to create a conforming dataset ref, create 

2076 # fake DataCoordinate values for the non-instrument 

2077 # dimensions. The type of the value does not matter here. 

2078 dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} 

2079 

2080 for instrument in instruments: 

2081 datasetRef = DatasetRef( 

2082 datasetType, 

2083 DataCoordinate.standardize( 

2084 dataId, instrument=instrument, graph=datasetType.dimensions 

2085 ), 

2086 run="validate", 

2087 ) 

2088 datasetRefs.append(datasetRef) 

2089 

2090 entities: list[DatasetType | DatasetRef] = [] 

2091 entities.extend(datasetTypes) 

2092 entities.extend(datasetRefs) 

2093 

2094 datastoreErrorStr = None 

2095 try: 

2096 self._datastore.validateConfiguration(entities, logFailures=logFailures) 

2097 except ValidationError as e: 

2098 datastoreErrorStr = str(e) 

2099 

2100 # Also check that the LookupKeys used by the datastores match 

2101 # registry and storage class definitions 

2102 keys = self._datastore.getLookupKeys() 

2103 

2104 failedNames = set() 

2105 failedDataId = set() 

2106 for key in keys: 

2107 if key.name is not None: 

2108 if key.name in ignore: 

2109 continue 

2110 

2111 # skip if specific datasetType names were requested and this 

2112 # name does not match 

2113 if datasetTypeNames and key.name not in datasetTypeNames: 

2114 continue 

2115 

2116 # See if it is a StorageClass or a DatasetType 

2117 if key.name in self.storageClasses: 

2118 pass 

2119 else: 

2120 try: 

2121 self.get_dataset_type(key.name) 

2122 except KeyError: 

2123 if logFailures: 

2124 _LOG.critical( 

2125 "Key '%s' does not correspond to a DatasetType or StorageClass", key 

2126 ) 

2127 failedNames.add(key) 

2128 else: 

2129 # Dimensions are checked for consistency when the Butler 

2130 # is created and rendezvoused with a universe. 

2131 pass 

2132 

2133 # Check that the instrument is a valid instrument 

2134 # Currently only support instrument so check for that 

2135 if key.dataId: 

2136 dataIdKeys = set(key.dataId) 

2137 if {"instrument"} != dataIdKeys: 

2138 if logFailures: 

2139 _LOG.critical("Key '%s' has unsupported DataId override", key) 

2140 failedDataId.add(key) 

2141 elif key.dataId["instrument"] not in instruments: 

2142 if logFailures: 

2143 _LOG.critical("Key '%s' has unknown instrument", key) 

2144 failedDataId.add(key) 

2145 

2146 messages = [] 

2147 

2148 if datastoreErrorStr: 

2149 messages.append(datastoreErrorStr) 

2150 

2151 for failed, msg in ( 

2152 (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), 

2153 (failedDataId, "Keys with bad DataId entries: "), 

2154 ): 

2155 if failed: 

2156 msg += ", ".join(str(k) for k in failed) 

2157 messages.append(msg) 

2158 

2159 if messages: 

2160 raise ValidationError(";\n".join(messages)) 

2161 
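# --- Editorial sketch, not part of the measured source ---
# Hypothetical configuration check, logging failures and skipping one
# dataset type. Repo path and the ignored name are illustrative assumptions.
from lsst.daf.butler import Butler

butler = Butler("/repo")
butler.validateConfiguration(logFailures=True, ignore=["raw"])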

2162 @property 

2163 def collections(self) -> Sequence[str]: 

2164 """The collections to search by default, in order 

2165 (`~collections.abc.Sequence` [ `str` ]). 

2166 

2167 This is an alias for ``self.registry.defaults.collections``. It cannot 

2168 be set directly in isolation, but all defaults may be changed together 

2169 by assigning a new `RegistryDefaults` instance to 

2170 ``self.registry.defaults``. 

2171 """ 

2172 return self._registry.defaults.collections 

2173 

2174 @property 

2175 def run(self) -> str | None: 

2176 """Name of the run this butler writes outputs to by default (`str` or 

2177 `None`). 

2178 

2179 This is an alias for ``self.registry.defaults.run``. It cannot be set 

2180 directly in isolation, but all defaults may be changed together by 

2181 assigning a new `RegistryDefaults` instance to 

2182 ``self.registry.defaults``. 

2183 """ 

2184 return self._registry.defaults.run 

2185 
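# --- Editorial sketch, not part of the measured source ---
# Replacing all registry defaults at once, as the property docstrings above
# describe. Repo path, collection, and run names are illustrative
# assumptions.
from lsst.daf.butler import Butler
from lsst.daf.butler.registry import RegistryDefaults

butler = Butler("/repo", writeable=True)
butler.registry.defaults = RegistryDefaults(collections=["HSC/defaults"], run="u/example/run1")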

2186 @property 

2187 def registry(self) -> Registry: 

2188 """The object that manages dataset metadata and relationships 

2189 (`Registry`). 

2190 

2191 Many operations that don't involve reading or writing butler datasets 

2192 are accessible only via `Registry` methods. Eventually these methods 

2193 will be replaced by equivalent `Butler` methods. 

2194 """ 

2195 return self._registry_shim 

2196 

2197 @property 

2198 def dimensions(self) -> DimensionUniverse: 

2199 # Docstring inherited. 

2200 return self._registry.dimensions 

2201 

2202 _registry: SqlRegistry 

2203 """The object that manages dataset metadata and relationships 

2204 (`SqlRegistry`). 

2205 

2206 Most operations that don't involve reading or writing butler datasets are 

2207 accessible only via `SqlRegistry` methods. 

2208 """ 

2209 

2210 datastore: Datastore 

2211 """The object that manages actual dataset storage (`Datastore`). 

2212 

2213 Direct user access to the datastore should rarely be necessary; the primary 

2214 exception is the case where a `Datastore` implementation provides extra 

2215 functionality beyond what the base class defines. 

2216 """ 

2217 

2218 storageClasses: StorageClassFactory 

2219 """An object that maps known storage class names to objects that fully 

2220 describe them (`StorageClassFactory`). 

2221 """