Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ConsistentDataIds", 

26 "Registry", 

27) 

28 

29from collections import defaultdict 

30import contextlib 

31from dataclasses import dataclass 

32import sys 

33from typing import ( 

34 Any, 

35 Iterable, 

36 Iterator, 

37 List, 

38 Mapping, 

39 Optional, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47import lsst.sphgeom 

48from ..core import ( 

49 Config, 

50 DataCoordinate, 

51 DataId, 

52 DatasetRef, 

53 DatasetType, 

54 Dimension, 

55 DimensionElement, 

56 DimensionGraph, 

57 DimensionRecord, 

58 DimensionUniverse, 

59 ExpandedDataCoordinate, 

60 StorageClassFactory, 

61) 

62from ..core import ddl 

63from ..core.utils import doImport, iterable, transactional 

64from ._config import RegistryConfig 

65from .queries import ( 

66 QueryBuilder, 

67 QuerySummary, 

68) 

69from .tables import makeRegistryTableSpecs 

70from ._collectionType import CollectionType 

71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch 

73 

74if TYPE_CHECKING: 74 ↛ 75line 74 didn't jump to line 75, because the condition on line 74 was never true

75 from ..butlerConfig import ButlerConfig 

76 from ..core import ( 

77 Quantum 

78 ) 

79 from .interfaces import ( 

80 CollectionManager, 

81 Database, 

82 OpaqueTableStorageManager, 

83 DimensionRecordStorageManager, 

84 DatasetRecordStorageManager, 

85 DatastoreRegistryBridgeManager, 

86 ) 

87 

88 

89@dataclass 

90class ConsistentDataIds: 

91 """A struct used to report relationships between data IDs by 

92 `Registry.relateDataIds`. 

93 

94 If an instance of this class is returned (instead of `None`), the data IDs 

95 are "not inconsistent" - any keys they have in common have the same value, 

96 and any spatial or temporal relationships they have at least might involve 

97 an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

98 to `True` in boolean contexts. 

99 """ 

100 

101 overlaps: bool 

102 """If `True`, the data IDs have at least one key in common, associated with 

103 the same value. 

104 

105 Note that data IDs are not inconsistent even if overlaps is `False` - they 

106 may simply have no keys in common, which means they cannot have 

107 inconsistent values for any keys. They may even be equal, in the case that 

108 both data IDs are empty. 

109 

110 This field does _not_ indicate whether a spatial or temporal overlap 

111 relationship exists. 

112 """ 

113 

114 contains: bool 

115 """If `True`, all keys in the first data ID are in the second, and are 

116 associated with the same values. 

117 

118 This includes case where the first data ID is empty. 

119 """ 

120 

121 within: bool 

122 """If `True`, all keys in the second data ID are in the first, and are 

123 associated with the same values. 

124 

125 This includes case where the second data ID is empty. 

126 """ 

127 

128 @property 

129 def equal(self) -> bool: 

130 """If `True`, the two data IDs are the same. 

131 

132 Data IDs are equal if they have both a `contains` and a `within` 

133 relationship. 

134 """ 

135 return self.contains and self.within 

136 

137 @property 

138 def disjoint(self) -> bool: 

139 """If `True`, the two data IDs have no keys in common. 

140 

141 This is simply the oppose of `overlaps`. Disjoint datasets are by 

142 definition not inconsistent. 

143 """ 

144 return not self.overlaps 

145 

146 def __bool__(self) -> bool: 

147 return True 

148 

149 

150class Registry: 

151 """Registry interface. 

152 

153 Parameters 

154 ---------- 

155 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

156 Registry configuration 

157 """ 

158 

159 defaultConfigFile = None 

160 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

161 absolute path. Can be None if no defaults specified. 

162 """ 

163 

164 @classmethod 

165 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

166 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

167 """Create `Registry` subclass instance from `config`. 

168 

169 Uses ``registry.cls`` from `config` to determine which subclass to 

170 instantiate. 

171 

172 Parameters 

173 ---------- 

174 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

175 Registry configuration 

176 create : `bool`, optional 

177 Assume empty Registry and create a new one. 

178 butlerRoot : `str`, optional 

179 Path to the repository root this `Registry` will manage. 

180 writeable : `bool`, optional 

181 If `True` (default) create a read-write connection to the database. 

182 

183 Returns 

184 ------- 

185 registry : `Registry` (subclass) 

186 A new `Registry` subclass instance. 

187 """ 

188 if not isinstance(config, RegistryConfig): 

189 if isinstance(config, str) or isinstance(config, Config): 

190 config = RegistryConfig(config) 

191 else: 

192 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

193 config.replaceRoot(butlerRoot) 

194 DatabaseClass = config.getDatabaseClass() 

195 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

196 namespace=config.get("namespace"), writeable=writeable) 

197 universe = DimensionUniverse(config) 

198 opaque = doImport(config["managers", "opaque"]) 

199 dimensions = doImport(config["managers", "dimensions"]) 

200 collections = doImport(config["managers", "collections"]) 

201 datasets = doImport(config["managers", "datasets"]) 

202 datastoreBridges = doImport(config["managers", "datastores"]) 

203 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

204 datasets=datasets, datastoreBridges=datastoreBridges, create=create) 

205 

206 def __init__(self, database: Database, universe: DimensionUniverse, *, 

207 opaque: Type[OpaqueTableStorageManager], 

208 dimensions: Type[DimensionRecordStorageManager], 

209 collections: Type[CollectionManager], 

210 datasets: Type[DatasetRecordStorageManager], 

211 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

212 create: bool = False): 

213 self._db = database 

214 self.storageClasses = StorageClassFactory() 

215 with self._db.declareStaticTables(create=create) as context: 

216 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

217 self._collections = collections.initialize(self._db, context) 

218 self._datasets = datasets.initialize(self._db, context, 

219 collections=self._collections, 

220 universe=self.dimensions) 

221 self._opaque = opaque.initialize(self._db, context) 

222 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

223 opaque=self._opaque, 

224 datasets=datasets, 

225 universe=self.dimensions) 

226 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, 

227 self._collections, 

228 self._datasets)) 

229 self._collections.refresh() 

230 self._datasets.refresh(universe=self._dimensions.universe) 

231 

232 def __str__(self) -> str: 

233 return str(self._db) 

234 

235 def __repr__(self) -> str: 

236 return f"Registry({self._db!r}, {self.dimensions!r})" 

237 

238 def isWriteable(self) -> bool: 

239 """Return `True` if this registry allows write operations, and `False` 

240 otherwise. 

241 """ 

242 return self._db.isWriteable() 

243 

244 @property 

245 def dimensions(self) -> DimensionUniverse: 

246 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

247 """ 

248 return self._dimensions.universe 

249 

250 @contextlib.contextmanager 

251 def transaction(self): 

252 """Return a context manager that represents a transaction. 

253 """ 

254 # TODO make savepoint=False the default. 

255 try: 

256 with self._db.transaction(): 

257 yield 

258 except BaseException: 

259 # TODO: this clears the caches sometimes when we wouldn't actually 

260 # need to. Can we avoid that? 

261 self._dimensions.clearCaches() 

262 raise 

263 

264 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec): 

265 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

266 other data repository client. 

267 

268 Opaque table records can be added via `insertOpaqueData`, retrieved via 

269 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

270 

271 Parameters 

272 ---------- 

273 tableName : `str` 

274 Logical name of the opaque table. This may differ from the 

275 actual name used in the database by a prefix and/or suffix. 

276 spec : `ddl.TableSpec` 

277 Specification for the table to be added. 

278 """ 

279 self._opaque.register(tableName, spec) 

280 

281 @transactional 

282 def insertOpaqueData(self, tableName: str, *data: dict): 

283 """Insert records into an opaque table. 

284 

285 Parameters 

286 ---------- 

287 tableName : `str` 

288 Logical name of the opaque table. Must match the name used in a 

289 previous call to `registerOpaqueTable`. 

290 data 

291 Each additional positional argument is a dictionary that represents 

292 a single row to be added. 

293 """ 

294 self._opaque[tableName].insert(*data) 

295 

296 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

297 """Retrieve records from an opaque table. 

298 

299 Parameters 

300 ---------- 

301 tableName : `str` 

302 Logical name of the opaque table. Must match the name used in a 

303 previous call to `registerOpaqueTable`. 

304 where 

305 Additional keyword arguments are interpreted as equality 

306 constraints that restrict the returned rows (combined with AND); 

307 keyword arguments are column names and values are the values they 

308 must have. 

309 

310 Yields 

311 ------ 

312 row : `dict` 

313 A dictionary representing a single result row. 

314 """ 

315 yield from self._opaque[tableName].fetch(**where) 

316 

317 @transactional 

318 def deleteOpaqueData(self, tableName: str, **where: Any): 

319 """Remove records from an opaque table. 

320 

321 Parameters 

322 ---------- 

323 tableName : `str` 

324 Logical name of the opaque table. Must match the name used in a 

325 previous call to `registerOpaqueTable`. 

326 where 

327 Additional keyword arguments are interpreted as equality 

328 constraints that restrict the deleted rows (combined with AND); 

329 keyword arguments are column names and values are the values they 

330 must have. 

331 """ 

332 self._opaque[tableName].delete(**where) 

333 

334 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED): 

335 """Add a new collection if one with the given name does not exist. 

336 

337 Parameters 

338 ---------- 

339 name : `str` 

340 The name of the collection to create. 

341 type : `CollectionType` 

342 Enum value indicating the type of collection to create. 

343 

344 Notes 

345 ----- 

346 This method cannot be called within transactions, as it needs to be 

347 able to perform its own transaction to be concurrent. 

348 """ 

349 self._collections.register(name, type) 

350 

351 def getCollectionType(self, name: str) -> CollectionType: 

352 """Return an enumeration value indicating the type of the given 

353 collection. 

354 

355 Parameters 

356 ---------- 

357 name : `str` 

358 The name of the collection. 

359 

360 Returns 

361 ------- 

362 type : `CollectionType` 

363 Enum value indicating the type of this collection. 

364 

365 Raises 

366 ------ 

367 MissingCollectionError 

368 Raised if no collection with the given name exists. 

369 """ 

370 return self._collections.find(name).type 

371 

372 def registerRun(self, name: str): 

373 """Add a new run if one with the given name does not exist. 

374 

375 Parameters 

376 ---------- 

377 name : `str` 

378 The name of the run to create. 

379 

380 Notes 

381 ----- 

382 This method cannot be called within transactions, as it needs to be 

383 able to perform its own transaction to be concurrent. 

384 """ 

385 self._collections.register(name, CollectionType.RUN) 

386 

387 @transactional 

388 def removeCollection(self, name: str): 

389 """Completely remove the given collection. 

390 

391 Parameters 

392 ---------- 

393 name : `str` 

394 The name of the collection to remove. 

395 

396 Raises 

397 ------ 

398 MissingCollectionError 

399 Raised if no collection with the given name exists. 

400 

401 Notes 

402 ----- 

403 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

404 in it are also fully removed. This requires that those datasets be 

405 removed (or at least trashed) from any datastores that hold them first. 

406 

407 A collection may not be deleted as long as it is referenced by a 

408 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

409 be deleted or redefined first. 

410 """ 

411 self._collections.remove(name) 

412 

413 def getCollectionChain(self, parent: str) -> CollectionSearch: 

414 """Return the child collections in a `~CollectionType.CHAINED` 

415 collection. 

416 

417 Parameters 

418 ---------- 

419 parent : `str` 

420 Name of the chained collection. Must have already been added via 

421 a call to `Registry.registerCollection`. 

422 

423 Returns 

424 ------- 

425 children : `CollectionSearch` 

426 An object that defines the search path of the collection. 

427 See :ref:`daf_butler_collection_expressions` for more information. 

428 

429 Raises 

430 ------ 

431 MissingCollectionError 

432 Raised if ``parent`` does not exist in the `Registry`. 

433 TypeError 

434 Raised if ``parent`` does not correspond to a 

435 `~CollectionType.CHAINED` collection. 

436 """ 

437 record = self._collections.find(parent) 

438 if record.type is not CollectionType.CHAINED: 

439 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

440 return record.children 

441 

442 @transactional 

443 def setCollectionChain(self, parent: str, children: Any): 

444 """Define or redefine a `~CollectionType.CHAINED` collection. 

445 

446 Parameters 

447 ---------- 

448 parent : `str` 

449 Name of the chained collection. Must have already been added via 

450 a call to `Registry.registerCollection`. 

451 children : `Any` 

452 An expression defining an ordered search of child collections, 

453 generally an iterable of `str`. Restrictions on the dataset types 

454 to be searched can also be included, by passing mapping or an 

455 iterable containing tuples; see 

456 :ref:`daf_butler_collection_expressions` for more information. 

457 

458 Raises 

459 ------ 

460 MissingCollectionError 

461 Raised when any of the given collections do not exist in the 

462 `Registry`. 

463 TypeError 

464 Raised if ``parent`` does not correspond to a 

465 `~CollectionType.CHAINED` collection. 

466 ValueError 

467 Raised if the given collections contains a cycle. 

468 """ 

469 record = self._collections.find(parent) 

470 if record.type is not CollectionType.CHAINED: 

471 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

472 children = CollectionSearch.fromExpression(children) 

473 if children != record.children: 

474 record.update(self._collections, children) 

475 

476 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

477 """ 

478 Add a new `DatasetType` to the Registry. 

479 

480 It is not an error to register the same `DatasetType` twice. 

481 

482 Parameters 

483 ---------- 

484 datasetType : `DatasetType` 

485 The `DatasetType` to be added. 

486 

487 Returns 

488 ------- 

489 inserted : `bool` 

490 `True` if ``datasetType`` was inserted, `False` if an identical 

491 existing `DatsetType` was found. Note that in either case the 

492 DatasetType is guaranteed to be defined in the Registry 

493 consistently with the given definition. 

494 

495 Raises 

496 ------ 

497 ValueError 

498 Raised if the dimensions or storage class are invalid. 

499 ConflictingDefinitionError 

500 Raised if this DatasetType is already registered with a different 

501 definition. 

502 

503 Notes 

504 ----- 

505 This method cannot be called within transactions, as it needs to be 

506 able to perform its own transaction to be concurrent. 

507 """ 

508 _, inserted = self._datasets.register(datasetType) 

509 return inserted 

510 

511 def getDatasetType(self, name: str) -> DatasetType: 

512 """Get the `DatasetType`. 

513 

514 Parameters 

515 ---------- 

516 name : `str` 

517 Name of the type. 

518 

519 Returns 

520 ------- 

521 type : `DatasetType` 

522 The `DatasetType` associated with the given name. 

523 

524 Raises 

525 ------ 

526 KeyError 

527 Requested named DatasetType could not be found in registry. 

528 """ 

529 storage = self._datasets.find(name) 

530 if storage is None: 

531 raise KeyError(f"DatasetType '{name}' could not be found.") 

532 return storage.datasetType 

533 

534 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

535 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

536 """Find a dataset given its `DatasetType` and data ID. 

537 

538 This can be used to obtain a `DatasetRef` that permits the dataset to 

539 be read from a `Datastore`. If the dataset is a component and can not 

540 be found using the provided dataset type, a dataset ref for the parent 

541 will be returned instead but with the correct dataset type. 

542 

543 Parameters 

544 ---------- 

545 datasetType : `DatasetType` or `str` 

546 A `DatasetType` or the name of one. 

547 dataId : `dict` or `DataCoordinate`, optional 

548 A `dict`-like object containing the `Dimension` links that identify 

549 the dataset within a collection. 

550 collections 

551 An expression that fully or partially identifies the collections 

552 to search for the dataset, such as a `str`, `re.Pattern`, or 

553 iterable thereof. `...` can be used to return all collections. 

554 See :ref:`daf_butler_collection_expressions` for more information. 

555 **kwargs 

556 Additional keyword arguments passed to 

557 `DataCoordinate.standardize` to convert ``dataId`` to a true 

558 `DataCoordinate` or augment an existing one. 

559 

560 Returns 

561 ------- 

562 ref : `DatasetRef` 

563 A reference to the dataset, or `None` if no matching Dataset 

564 was found. 

565 

566 Raises 

567 ------ 

568 LookupError 

569 Raised if one or more data ID keys are missing or the dataset type 

570 does not exist. 

571 MissingCollectionError 

572 Raised if any of ``collections`` does not exist in the registry. 

573 """ 

574 if isinstance(datasetType, DatasetType): 

575 storage = self._datasets.find(datasetType.name) 

576 if storage is None: 

577 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

578 else: 

579 storage = self._datasets.find(datasetType) 

580 if storage is None: 

581 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

582 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

583 universe=self.dimensions, **kwargs) 

584 collections = CollectionSearch.fromExpression(collections) 

585 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

586 result = storage.find(collectionRecord, dataId) 

587 if result is not None: 

588 if result.datasetType.isComposite(): 

589 result = self._datasets.fetchComponents(result) 

590 return result 

591 

592 # fallback to the parent if we got nothing and this was a component 

593 if storage.datasetType.isComponent(): 

594 parentType, _ = storage.datasetType.nameAndComponent() 

595 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs) 

596 if parentRef is not None: 

597 # Should already conform and we know no components 

598 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id, 

599 run=parentRef.run, conform=False, hasParentId=True) 

600 

601 return None 

602 

603 @transactional 

604 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

605 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False 

606 ) -> List[DatasetRef]: 

607 """Insert one or more datasets into the `Registry` 

608 

609 This always adds new datasets; to associate existing datasets with 

610 a new collection, use ``associate``. 

611 

612 Parameters 

613 ---------- 

614 datasetType : `DatasetType` or `str` 

615 A `DatasetType` or the name of one. 

616 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

617 Dimension-based identifiers for the new datasets. 

618 run : `str` 

619 The name of the run that produced the datasets. 

620 producer : `Quantum` 

621 Unit of work that produced the datasets. May be `None` to store 

622 no provenance information, but if present the `Quantum` must 

623 already have been added to the Registry. 

624 recursive : `bool` 

625 If True, recursively add datasets and attach entries for component 

626 datasets as well. 

627 

628 Returns 

629 ------- 

630 refs : `list` of `DatasetRef` 

631 Resolved `DatasetRef` instances for all given data IDs (in the same 

632 order). 

633 

634 Raises 

635 ------ 

636 ConflictingDefinitionError 

637 If a dataset with the same dataset type and data ID as one of those 

638 given already exists in ``run``. 

639 MissingCollectionError 

640 Raised if ``run`` does not exist in the registry. 

641 """ 

642 if isinstance(datasetType, DatasetType): 

643 storage = self._datasets.find(datasetType.name) 

644 if storage is None: 

645 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

646 else: 

647 storage = self._datasets.find(datasetType) 

648 if storage is None: 

649 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

650 runRecord = self._collections.find(run) 

651 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds] 

652 try: 

653 refs = list(storage.insert(runRecord, dataIds, quantum=producer)) 

654 except sqlalchemy.exc.IntegrityError as err: 

655 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

656 f"one or more datasets of type {storage.datasetType} into " 

657 f"collection '{run}'. " 

658 f"This probably means a dataset with the same data ID " 

659 f"and dataset type already exists, but it may also mean a " 

660 f"dimension row is missing.") from err 

661 if recursive and storage.datasetType.isComposite(): 

662 # Insert component rows by recursing. 

663 composites = defaultdict(dict) 

664 # TODO: we really shouldn't be inserting all components defined by 

665 # the storage class, because there's no guarantee all of them are 

666 # actually present in these datasets. 

667 for componentName in storage.datasetType.storageClass.components: 

668 componentDatasetType = storage.datasetType.makeComponentDatasetType(componentName) 

669 componentRefs = self.insertDatasets(componentDatasetType, 

670 dataIds=dataIds, 

671 run=run, 

672 producer=producer, 

673 recursive=True) 

674 for parentRef, componentRef in zip(refs, componentRefs): 

675 composites[parentRef][componentName] = componentRef 

676 if composites: 

677 refs = list(self._datasets.attachComponents(composites.items())) 

678 return refs 

679 

680 def getDataset(self, id: int) -> Optional[DatasetRef]: 

681 """Retrieve a Dataset entry. 

682 

683 Parameters 

684 ---------- 

685 id : `int` 

686 The unique identifier for the dataset. 

687 

688 Returns 

689 ------- 

690 ref : `DatasetRef` or `None` 

691 A ref to the Dataset, or `None` if no matching Dataset 

692 was found. 

693 """ 

694 ref = self._datasets.getDatasetRef(id) 

695 if ref is None: 

696 return None 

697 if ref.datasetType.isComposite(): 

698 return self._datasets.fetchComponents(ref) 

699 return ref 

700 

701 @transactional 

702 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True): 

703 """Remove datasets from the Registry. 

704 

705 The datasets will be removed unconditionally from all collections, and 

706 any `Quantum` that consumed this dataset will instead be marked with 

707 having a NULL input. `Datastore` records will *not* be deleted; the 

708 caller is responsible for ensuring that the dataset has already been 

709 removed from all Datastores. 

710 

711 Parameters 

712 ---------- 

713 refs : `Iterable` of `DatasetRef` 

714 References to the datasets to be removed. Must include a valid 

715 ``id`` attribute, and should be considered invalidated upon return. 

716 recursive : `bool`, optional 

717 If `True`, remove all component datasets as well. Note that 

718 this only removes components that are actually included in the 

719 given `DatasetRef` instances, which may not be the same as those in 

720 the database (especially if they were obtained from 

721 `queryDatasets`, which does not populate `DatasetRef.components`). 

722 

723 Raises 

724 ------ 

725 AmbiguousDatasetError 

726 Raised if any ``ref.id`` is `None`. 

727 OrphanedRecordError 

728 Raised if any dataset is still present in any `Datastore`. 

729 """ 

730 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

731 storage = self._datasets.find(datasetType.name) 

732 try: 

733 storage.delete(refsForType) 

734 except sqlalchemy.exc.IntegrityError as err: 

735 raise OrphanedRecordError("One or more datasets is still " 

736 "present in one or more Datastores.") from err 

737 

738 @transactional 

739 def attachComponents(self, parent: DatasetRef, components: Mapping[str, DatasetRef]): 

740 """Attach components to a dataset. 

741 

742 Parameters 

743 ---------- 

744 parent : `DatasetRef` 

745 A reference to the parent dataset. 

746 components : `Mapping` [ `str`, `DatasetRef` ] 

747 Mapping from component name to the `DatasetRef` for that component. 

748 

749 Returns 

750 ------- 

751 ref : `DatasetRef` 

752 An updated version of ``parent`` with components included. 

753 

754 Raises 

755 ------ 

756 AmbiguousDatasetError 

757 Raised if ``parent.id`` or any `DatasetRef.id` in ``components`` 

758 is `None`. 

759 """ 

760 for name, ref in components.items(): 

761 if ref.datasetType.storageClass != parent.datasetType.storageClass.components[name]: 

762 raise TypeError(f"Expected storage class " 

763 f"'{parent.datasetType.storageClass.components[name].name}' " 

764 f"for component '{name}' of dataset {parent}; got " 

765 f"dataset {ref} with storage class " 

766 f"'{ref.datasetType.storageClass.name}'.") 

767 ref, = self._datasets.attachComponents([(parent, components)]) 

768 return ref 

769 

770 @transactional 

771 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

772 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

773 

774 If a DatasetRef with the same exact integer ID is already in a 

775 collection nothing is changed. If a `DatasetRef` with the same 

776 `DatasetType` and data ID but with different integer ID 

777 exists in the collection, `ConflictingDefinitionError` is raised. 

778 

779 Parameters 

780 ---------- 

781 collection : `str` 

782 Indicates the collection the datasets should be associated with. 

783 refs : `Iterable` [ `DatasetRef` ] 

784 An iterable of resolved `DatasetRef` instances that already exist 

785 in this `Registry`. 

786 recursive : `bool`, optional 

787 If `True`, associate all component datasets as well. Note that 

788 this only associates components that are actually included in the 

789 given `DatasetRef` instances, which may not be the same as those in 

790 the database (especially if they were obtained from 

791 `queryDatasets`, which does not populate `DatasetRef.components`). 

792 

793 Raises 

794 ------ 

795 ConflictingDefinitionError 

796 If a Dataset with the given `DatasetRef` already exists in the 

797 given collection. 

798 AmbiguousDatasetError 

799 Raised if ``any(ref.id is None for ref in refs)``. 

800 MissingCollectionError 

801 Raised if ``collection`` does not exist in the registry. 

802 TypeError 

803 Raise adding new datasets to the given ``collection`` is not 

804 allowed. 

805 """ 

806 collectionRecord = self._collections.find(collection) 

807 if collectionRecord.type is not CollectionType.TAGGED: 

808 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

809 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

810 storage = self._datasets.find(datasetType.name) 

811 try: 

812 storage.associate(collectionRecord, refsForType) 

813 except sqlalchemy.exc.IntegrityError as err: 

814 raise ConflictingDefinitionError( 

815 f"Constraint violation while associating dataset of type {datasetType.name} with " 

816 f"collection {collection}. This probably means that one or more datasets with the same " 

817 f"dataset type and data ID already exist in the collection, but it may also indicate " 

818 f"that the datasets do not exist." 

819 ) from err 

820 

821 @transactional 

822 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

823 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

824 

825 ``collection`` and ``ref`` combinations that are not currently 

826 associated are silently ignored. 

827 

828 Parameters 

829 ---------- 

830 collection : `str` 

831 The collection the datasets should no longer be associated with. 

832 refs : `Iterable` [ `DatasetRef` ] 

833 An iterable of resolved `DatasetRef` instances that already exist 

834 in this `Registry`. 

835 recursive : `bool`, optional 

836 If `True`, disassociate all component datasets as well. Note that 

837 this only disassociates components that are actually included in 

838 the given `DatasetRef` instances, which may not be the same as 

839 those in the database (especially if they were obtained from 

840 `queryDatasets`, which does not populate `DatasetRef.components`). 

841 

842 Raises 

843 ------ 

844 AmbiguousDatasetError 

845 Raised if any of the given dataset references is unresolved. 

846 MissingCollectionError 

847 Raised if ``collection`` does not exist in the registry. 

848 TypeError 

849 Raise adding new datasets to the given ``collection`` is not 

850 allowed. 

851 """ 

852 collectionRecord = self._collections.find(collection) 

853 if collectionRecord.type is not CollectionType.TAGGED: 

854 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

855 "expected TAGGED.") 

856 for datasetType, refsForType in DatasetRef.groupByType(refs, recursive=recursive).items(): 

857 storage = self._datasets.find(datasetType.name) 

858 storage.disassociate(collectionRecord, refsForType) 

859 

860 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

861 # TODO docs 

862 return self._datastoreBridges 

863 

864 def getDatasetLocations(self, ref: DatasetRef) -> Iterator[str]: 

865 """Retrieve datastore locations for a given dataset. 

866 

867 Typically used by `Datastore`. 

868 

869 Parameters 

870 ---------- 

871 ref : `DatasetRef` 

872 A reference to the dataset for which to retrieve storage 

873 information. 

874 

875 Returns 

876 ------- 

877 datastores : `Iterable` [ `str` ] 

878 All the matching datastores holding this dataset. 

879 

880 Raises 

881 ------ 

882 AmbiguousDatasetError 

883 Raised if ``ref.id`` is `None`. 

884 """ 

885 return self._datastoreBridges.findDatastores(ref) 

886 

887 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

888 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds): 

889 """Expand a dimension-based data ID to include additional information. 

890 

891 Parameters 

892 ---------- 

893 dataId : `DataCoordinate` or `dict`, optional 

894 Data ID to be expanded; augmented and overridden by ``kwds``. 

895 graph : `DimensionGraph`, optional 

896 Set of dimensions for the expanded ID. If `None`, the dimensions 

897 will be inferred from the keys of ``dataId`` and ``kwds``. 

898 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

899 are silently ignored, providing a way to extract and expand a 

900 subset of a data ID. 

901 records : mapping [`DimensionElement`, `DimensionRecord`], optional 

902 Dimension record data to use before querying the database for that 

903 data. 

904 **kwds 

905 Additional keywords are treated like additional key-value pairs for 

906 ``dataId``, extending and overriding 

907 

908 Returns 

909 ------- 

910 expanded : `ExpandedDataCoordinate` 

911 A data ID that includes full metadata for all of the dimensions it 

912 identifieds. 

913 """ 

914 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds) 

915 if isinstance(standardized, ExpandedDataCoordinate): 

916 return standardized 

917 elif isinstance(dataId, ExpandedDataCoordinate): 

918 records = dict(records) if records is not None else {} 

919 records.update(dataId.records) 

920 else: 

921 records = dict(records) if records is not None else {} 

922 keys = dict(standardized) 

923 regions = [] 

924 timespans = [] 

925 for element in standardized.graph.primaryKeyTraversalOrder: 

926 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

927 if record is ...: 

928 storage = self._dimensions[element] 

929 record = storage.fetch(keys) 

930 records[element] = record 

931 if record is not None: 

932 for d in element.implied: 

933 value = getattr(record, d.name) 

934 if keys.setdefault(d, value) != value: 

935 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, " 

936 f"but {element.name} implies {d.name}={value!r}.") 

937 if element in standardized.graph.spatial and record.region is not None: 

938 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

939 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

940 f"is disjoint with those for other elements.") 

941 regions.append(record.region) 

942 if element in standardized.graph.temporal: 

943 if any(not record.timespan.overlaps(t) for t in timespans): 

944 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

945 f" is disjoint with those for other elements.") 

946 timespans.append(record.timespan) 

947 else: 

948 if element in standardized.graph.required: 

949 raise LookupError( 

950 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

951 ) 

952 if element.alwaysJoin: 

953 raise InconsistentDataIdError( 

954 f"Could not fetch record for element {element.name} via keys {keys}, ", 

955 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

956 "related." 

957 ) 

958 records.update((d, None) for d in element.implied) 

959 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

960 

961 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

962 """Compare the keys and values of a pair of data IDs for consistency. 

963 

964 See `ConsistentDataIds` for more information. 

965 

966 Parameters 

967 ---------- 

968 a : `dict` or `DataCoordinate` 

969 First data ID to be compared. 

970 b : `dict` or `DataCoordinate` 

971 Second data ID to be compared. 

972 

973 Returns 

974 ------- 

975 relationship : `ConsistentDataIds` or `None` 

976 Relationship information. This is not `None` and coerces to 

977 `True` in boolean contexts if and only if the data IDs are 

978 consistent in terms of all common key-value pairs, all many-to-many 

979 join tables, and all spatial andtemporal relationships. 

980 """ 

981 a = DataCoordinate.standardize(a, universe=self.dimensions) 

982 b = DataCoordinate.standardize(b, universe=self.dimensions) 

983 aFull = getattr(a, "full", None) 

984 bFull = getattr(b, "full", None) 

985 aBest = aFull if aFull is not None else a 

986 bBest = bFull if bFull is not None else b 

987 jointKeys = aBest.keys() & bBest.keys() 

988 # If any common values are not equal, we know they are inconsistent. 

989 if any(aBest[k] != bBest[k] for k in jointKeys): 

990 return None 

991 # If the graphs are equal, we know the data IDs are. 

992 if a.graph == b.graph: 

993 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

994 # Result is still inconclusive. Try to expand a data ID containing 

995 # keys from both; that will fail if they are inconsistent. 

996 # First, if either input was already an ExpandedDataCoordinate, extract 

997 # its records so we don't have to query for them. 

998 records = {} 

999 if hasattr(a, "records"): 

1000 records.update(a.records) 

1001 if hasattr(b, "records"): 

1002 records.update(b.records) 

1003 try: 

1004 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records) 

1005 except InconsistentDataIdError: 

1006 return None 

1007 # We know the answer is not `None`; time to figure out what it is. 

1008 return ConsistentDataIds( 

1009 contains=(a.graph >= b.graph), 

1010 within=(a.graph <= b.graph), 

1011 overlaps=bool(a.graph & b.graph), 

1012 ) 

1013 

1014 def insertDimensionData(self, element: Union[DimensionElement, str], 

1015 *data: Union[dict, DimensionRecord], 

1016 conform: bool = True): 

1017 """Insert one or more dimension records into the database. 

1018 

1019 Parameters 

1020 ---------- 

1021 element : `DimensionElement` or `str` 

1022 The `DimensionElement` or name thereof that identifies the table 

1023 records will be inserted into. 

1024 data : `dict` or `DimensionRecord` (variadic) 

1025 One or more records to insert. 

1026 conform : `bool`, optional 

1027 If `False` (`True` is default) perform no checking or conversions, 

1028 and assume that ``element`` is a `DimensionElement` instance and 

1029 ``data`` is a one or more `DimensionRecord` instances of the 

1030 appropriate subclass. 

1031 """ 

1032 if conform: 

1033 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1034 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1035 for row in data] 

1036 else: 

1037 records = data 

1038 storage = self._dimensions[element] 

1039 storage.insert(*records) 

1040 

1041 def syncDimensionData(self, element: Union[DimensionElement, str], 

1042 row: Union[dict, DimensionRecord], 

1043 conform: bool = True) -> bool: 

1044 """Synchronize the given dimension record with the database, inserting 

1045 if it does not already exist and comparing values if it does. 

1046 

1047 Parameters 

1048 ---------- 

1049 element : `DimensionElement` or `str` 

1050 The `DimensionElement` or name thereof that identifies the table 

1051 records will be inserted into. 

1052 row : `dict` or `DimensionRecord` 

1053 The record to insert. 

1054 conform : `bool`, optional 

1055 If `False` (`True` is default) perform no checking or conversions, 

1056 and assume that ``element`` is a `DimensionElement` instance and 

1057 ``data`` is a one or more `DimensionRecord` instances of the 

1058 appropriate subclass. 

1059 

1060 Returns 

1061 ------- 

1062 inserted : `bool` 

1063 `True` if a new row was inserted, `False` otherwise. 

1064 

1065 Raises 

1066 ------ 

1067 ConflictingDefinitionError 

1068 Raised if the record exists in the database (according to primary 

1069 key lookup) but is inconsistent with the given one. 

1070 

1071 Notes 

1072 ----- 

1073 This method cannot be called within transactions, as it needs to be 

1074 able to perform its own transaction to be concurrent. 

1075 """ 

1076 if conform: 

1077 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1078 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1079 else: 

1080 record = row 

1081 storage = self._dimensions[element] 

1082 return storage.sync(record) 

1083 

1084 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]: 

1085 """Iterate over the dataset types whose names match an expression. 

1086 

1087 Parameters 

1088 ---------- 

1089 expression : `Any`, optional 

1090 An expression that fully or partially identifies the dataset types 

1091 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1092 `...` can be used to return all dataset types, and is the default. 

1093 See :ref:`daf_butler_dataset_type_expressions` for more 

1094 information. 

1095 

1096 Yields 

1097 ------ 

1098 datasetType : `DatasetType` 

1099 A `DatasetType` instance whose name matches ``expression``. 

1100 """ 

1101 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1102 if wildcard is ...: 

1103 yield from self._datasets 

1104 return 

1105 done = set() 

1106 for name in wildcard.strings: 

1107 storage = self._datasets.find(name) 

1108 if storage is not None: 

1109 done.add(storage.datasetType) 

1110 yield storage.datasetType 

1111 if wildcard.patterns: 

1112 for datasetType in self._datasets: 

1113 if datasetType.name in done: 

1114 continue 

1115 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1116 yield datasetType 

1117 

1118 def queryCollections(self, expression: Any = ..., 

1119 datasetType: Optional[DatasetType] = None, 

1120 collectionType: Optional[CollectionType] = None, 

1121 flattenChains: bool = False, 

1122 includeChains: Optional[bool] = None) -> Iterator[str]: 

1123 """Iterate over the collections whose names match an expression. 

1124 

1125 Parameters 

1126 ---------- 

1127 expression : `Any`, optional 

1128 An expression that fully or partially identifies the collections 

1129 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1130 `...` can be used to return all collections, and is the default. 

1131 See :ref:`daf_butler_collection_expressions` for more 

1132 information. 

1133 datasetType : `DatasetType`, optional 

1134 If provided, only yield collections that should be searched for 

1135 this dataset type according to ``expression``. If this is 

1136 not provided, any dataset type restrictions in ``expression`` are 

1137 ignored. 

1138 collectionType : `CollectionType`, optional 

1139 If provided, only yield collections of this type. 

1140 flattenChains : `bool`, optional 

1141 If `True` (`False` is default), recursively yield the child 

1142 collections of matching `~CollectionType.CHAINED` collections. 

1143 includeChains : `bool`, optional 

1144 If `True`, yield records for matching `~CollectionType.CHAINED` 

1145 collections. Default is the opposite of ``flattenChains``: include 

1146 either CHAINED collections or their children, but not both. 

1147 

1148 Yields 

1149 ------ 

1150 collection : `str` 

1151 The name of a collection that matches ``expression``. 

1152 """ 

1153 query = CollectionQuery.fromExpression(expression) 

1154 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1155 flattenChains=flattenChains, includeChains=includeChains): 

1156 yield record.name 

1157 

1158 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1159 """Return a `QueryBuilder` instance capable of constructing and 

1160 managing more complex queries than those obtainable via `Registry` 

1161 interfaces. 

1162 

1163 This is an advanced interface; downstream code should prefer 

1164 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1165 are sufficient. 

1166 

1167 Parameters 

1168 ---------- 

1169 summary : `QuerySummary` 

1170 Object describing and categorizing the full set of dimensions that 

1171 will be included in the query. 

1172 

1173 Returns 

1174 ------- 

1175 builder : `QueryBuilder` 

1176 Object that can be used to construct and perform advanced queries. 

1177 """ 

1178 return QueryBuilder(summary=summary, 

1179 collections=self._collections, 

1180 dimensions=self._dimensions, 

1181 datasets=self._datasets) 

1182 

1183 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1184 dataId: Optional[DataId] = None, 

1185 datasets: Any = None, 

1186 collections: Any = None, 

1187 where: Optional[str] = None, 

1188 expand: bool = True, 

1189 **kwds) -> Iterator[DataCoordinate]: 

1190 """Query for and iterate over data IDs matching user-provided criteria. 

1191 

1192 Parameters 

1193 ---------- 

1194 dimensions : `Dimension` or `str`, or iterable thereof 

1195 The dimensions of the data IDs to yield, as either `Dimension` 

1196 instances or `str`. Will be automatically expanded to a complete 

1197 `DimensionGraph`. 

1198 dataId : `dict` or `DataCoordinate`, optional 

1199 A data ID whose key-value pairs are used as equality constraints 

1200 in the query. 

1201 datasets : `Any`, optional 

1202 An expression that fully or partially identifies dataset types 

1203 that should constrain the yielded data IDs. For example, including 

1204 "raw" here would constrain the yielded ``instrument``, 

1205 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1206 those for which at least one "raw" dataset exists in 

1207 ``collections``. Allowed types include `DatasetType`, `str`, 

1208 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1209 expressions, `...` is not permitted - it doesn't make sense to 

1210 constrain data IDs on the existence of *all* datasets. 

1211 See :ref:`daf_butler_dataset_type_expressions` for more 

1212 information. 

1213 collections: `Any`, optional 

1214 An expression that fully or partially identifies the collections 

1215 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1216 thereof. `...` can be used to return all collections. Must be 

1217 provided if ``datasets`` is, and is ignored if it is not. See 

1218 :ref:`daf_butler_collection_expressions` for more information. 

1219 where : `str`, optional 

1220 A string expression similar to a SQL WHERE clause. May involve 

1221 any column of a dimension table or (as a shortcut for the primary 

1222 key column of a dimension table) dimension name. See 

1223 :ref:`daf_butler_dimension_expressions` for more information. 

1224 expand : `bool`, optional 

1225 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1226 minimal `DataCoordinate` base-class instances. 

1227 kwds 

1228 Additional keyword arguments are forwarded to 

1229 `DataCoordinate.standardize` when processing the ``dataId`` 

1230 argument (and may be used to provide a constraining data ID even 

1231 when the ``dataId`` argument is `None`). 

1232 

1233 Yields 

1234 ------ 

1235 dataId : `DataCoordinate` 

1236 Data IDs matching the given query parameters. Order is 

1237 unspecified. 

1238 """ 

1239 dimensions = iterable(dimensions) 

1240 standardizedDataId = self.expandDataId(dataId, **kwds) 

1241 standardizedDatasetTypes = [] 

1242 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1243 if datasets is not None: 

1244 if collections is None: 

1245 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1246 for datasetType in self.queryDatasetTypes(datasets): 

1247 requestedDimensionNames.update(datasetType.dimensions.names) 

1248 standardizedDatasetTypes.append(datasetType) 

1249 # Preprocess collections expression in case the original included 

1250 # single-pass iterators (we'll want to use it multiple times 

1251 # below). 

1252 collections = CollectionQuery.fromExpression(collections) 

1253 

1254 summary = QuerySummary( 

1255 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1256 dataId=standardizedDataId, 

1257 expression=where, 

1258 ) 

1259 builder = self.makeQueryBuilder(summary) 

1260 for datasetType in standardizedDatasetTypes: 

1261 builder.joinDataset(datasetType, collections, isResult=False) 

1262 query = builder.finish() 

1263 predicate = query.predicate() 

1264 for row in self._db.query(query.sql): 

1265 if predicate(row): 

1266 result = query.extractDataId(row) 

1267 if expand: 

1268 yield self.expandDataId(result, records=standardizedDataId.records) 

1269 else: 

1270 yield result 

1271 

1272 def queryDatasets(self, datasetType: Any, *, 

1273 collections: Any, 

1274 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1275 dataId: Optional[DataId] = None, 

1276 where: Optional[str] = None, 

1277 deduplicate: bool = False, 

1278 expand: bool = True, 

1279 **kwds) -> Iterator[DatasetRef]: 

1280 """Query for and iterate over dataset references matching user-provided 

1281 criteria. 

1282 

1283 Parameters 

1284 ---------- 

1285 datasetType 

1286 An expression that fully or partially identifies the dataset types 

1287 to be queried. Allowed types include `DatasetType`, `str`, 

1288 `re.Pattern`, and iterables thereof. The special value `...` can 

1289 be used to query all dataset types. See 

1290 :ref:`daf_butler_dataset_type_expressions` for more information. 

1291 collections 

1292 An expression that fully or partially identifies the collections 

1293 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1294 thereof. `...` can be used to return all collections. See 

1295 :ref:`daf_butler_collection_expressions` for more information. 

1296 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1297 Dimensions to include in the query (in addition to those used 

1298 to identify the queried dataset type(s)), either to constrain 

1299 the resulting datasets to those for which a matching dimension 

1300 exists, or to relate the dataset type's dimensions to dimensions 

1301 referenced by the ``dataId`` or ``where`` arguments. 

1302 dataId : `dict` or `DataCoordinate`, optional 

1303 A data ID whose key-value pairs are used as equality constraints 

1304 in the query. 

1305 where : `str`, optional 

1306 A string expression similar to a SQL WHERE clause. May involve 

1307 any column of a dimension table or (as a shortcut for the primary 

1308 key column of a dimension table) dimension name. See 

1309 :ref:`daf_butler_dimension_expressions` for more information. 

1310 deduplicate : `bool`, optional 

1311 If `True` (`False` is default), for each result data ID, only 

1312 yield one `DatasetRef` of each `DatasetType`, from the first 

1313 collection in which a dataset of that dataset type appears 

1314 (according to the order of ``collections`` passed in). If `True`, 

1315 ``collections`` must not contain regular expressions and may not 

1316 be `...`. 

1317 expand : `bool`, optional 

1318 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1319 minimal `DataCoordinate` base-class instances. 

1320 kwds 

1321 Additional keyword arguments are forwarded to 

1322 `DataCoordinate.standardize` when processing the ``dataId`` 

1323 argument (and may be used to provide a constraining data ID even 

1324 when the ``dataId`` argument is `None`). 

1325 

1326 Yields 

1327 ------ 

1328 ref : `DatasetRef` 

1329 Dataset references matching the given query criteria. These 

1330 are grouped by `DatasetType` if the query evaluates to multiple 

1331 dataset types, but order is otherwise unspecified. 

1332 

1333 Raises 

1334 ------ 

1335 TypeError 

1336 Raised when the arguments are incompatible, such as when a 

1337 collection wildcard is passed when ``deduplicate`` is `True`. 

1338 

1339 Notes 

1340 ----- 

1341 When multiple dataset types are queried in a single call, the 

1342 results of this operation are equivalent to querying for each dataset 

1343 type separately in turn, and no information about the relationships 

1344 between datasets of different types is included. In contexts where 

1345 that kind of information is important, the recommended pattern is to 

1346 use `queryDimensions` to first obtain data IDs (possibly with the 

1347 desired dataset types and collections passed as constraints to the 

1348 query), and then use multiple (generally much simpler) calls to 

1349 `queryDatasets` with the returned data IDs passed as constraints. 

1350 """ 

1351 # Standardize the collections expression. 

1352 if deduplicate: 

1353 collections = CollectionSearch.fromExpression(collections) 

1354 else: 

1355 collections = CollectionQuery.fromExpression(collections) 

1356 # Standardize and expand the data ID provided as a constraint. 

1357 standardizedDataId = self.expandDataId(dataId, **kwds) 

1358 # If the datasetType passed isn't actually a DatasetType, expand it 

1359 # (it could be an expression that yields multiple DatasetTypes) and 

1360 # recurse. 

1361 if not isinstance(datasetType, DatasetType): 

1362 for trueDatasetType in self.queryDatasetTypes(datasetType): 

1363 yield from self.queryDatasets(trueDatasetType, collections=collections, 

1364 dimensions=dimensions, dataId=standardizedDataId, 

1365 where=where, deduplicate=deduplicate) 

1366 return 

1367 # The full set of dimensions in the query is the combination of those 

1368 # needed for the DatasetType and those explicitly requested, if any. 

1369 requestedDimensionNames = set(datasetType.dimensions.names) 

1370 if dimensions is not None: 

1371 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1372 # Construct the summary structure needed to construct a QueryBuilder. 

1373 summary = QuerySummary( 

1374 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1375 dataId=standardizedDataId, 

1376 expression=where, 

1377 ) 

1378 builder = self.makeQueryBuilder(summary) 

1379 # Add the dataset subquery to the query, telling the QueryBuilder to 

1380 # include the rank of the selected collection in the results only if we 

1381 # need to deduplicate. Note that if any of the collections are 

1382 # actually wildcard expressions, and we've asked for deduplication, 

1383 # this will raise TypeError for us. 

1384 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1385 return 

1386 query = builder.finish() 

1387 predicate = query.predicate() 

1388 if not deduplicate: 

1389 # No need to de-duplicate across collections. 

1390 for row in self._db.query(query.sql): 

1391 if predicate(row): 

1392 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1393 if expand: 

1394 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1395 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1396 else: 

1397 # For each data ID, yield only the DatasetRef with the lowest 

1398 # collection rank. 

1399 bestRefs = {} 

1400 bestRanks = {} 

1401 for row in self._db.query(query.sql): 

1402 if predicate(row): 

1403 ref, rank = query.extractDatasetRef(row, datasetType) 

1404 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1405 if rank < bestRank: 

1406 bestRefs[ref.dataId] = ref 

1407 bestRanks[ref.dataId] = rank 

1408 # If caller requested expanded data IDs, we defer that until here 

1409 # so we do as little expansion as possible. 

1410 if expand: 

1411 for ref in bestRefs.values(): 

1412 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1413 yield ref.expanded(dataId) 

1414 else: 

1415 yield from bestRefs.values() 

1416 

1417 dimensions: DimensionUniverse 

1418 """The universe of all dimensions known to the registry 

1419 (`DimensionUniverse`). 

1420 """ 

1421 

1422 storageClasses: StorageClassFactory 

1423 """All storage classes known to the registry (`StorageClassFactory`). 

1424 """