
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ConsistentDataIds", 

26 "Registry", 

27) 

28 

29from collections import defaultdict 

30import contextlib 

31from dataclasses import dataclass 

32import sys 

33from typing import ( 

34 Any, 

35 Dict, 

36 Iterable, 

37 Iterator, 

38 List, 

39 Mapping, 

40 Optional, 

41 Type, 

42 TYPE_CHECKING, 

43 Union, 

44) 

45 

46import sqlalchemy 

47 

48import lsst.sphgeom 

49from ..core import ( 

50 Config, 

51 DataCoordinate, 

52 DataId, 

53 DatasetRef, 

54 DatasetType, 

55 Dimension, 

56 DimensionElement, 

57 DimensionGraph, 

58 DimensionRecord, 

59 DimensionUniverse, 

60 ExpandedDataCoordinate, 

61 StorageClassFactory, 

62) 

63from ..core import ddl 

64from ..core.utils import doImport, iterable, transactional 

65from ._config import RegistryConfig 

66from .queries import ( 

67 QueryBuilder, 

68 QuerySummary, 

69) 

70from .tables import makeRegistryTableSpecs 

71from ._collectionType import CollectionType 

72from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

73from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch 

74 

75if TYPE_CHECKING: 

76 from ..butlerConfig import ButlerConfig 

77 from ..core import ( 

78 Quantum 

79 ) 

80 from .interfaces import ( 

81 CollectionManager, 

82 Database, 

83 OpaqueTableStorageManager, 

84 DimensionRecordStorageManager, 

85 DatasetRecordStorageManager, 

86 DatastoreRegistryBridgeManager, 

87 ) 

88 

89 

90@dataclass 

91class ConsistentDataIds: 

92 """A struct used to report relationships between data IDs by 

93 `Registry.relateDataIds`. 

94 

95 If an instance of this class is returned (instead of `None`), the data IDs 

96 are "not inconsistent" - any keys they have in common have the same value, 

97 and any spatial or temporal relationships they have at least might involve 

98 an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

99 to `True` in boolean contexts. 

100 """ 

101 

102 overlaps: bool 

103 """If `True`, the data IDs have at least one key in common, associated with 

104 the same value. 

105 

106 Note that data IDs are not inconsistent even if overlaps is `False` - they 

107 may simply have no keys in common, which means they cannot have 

108 inconsistent values for any keys. They may even be equal, in the case that 

109 both data IDs are empty. 

110 

111 This field does _not_ indicate whether a spatial or temporal overlap 

112 relationship exists. 

113 """ 

114 

115 contains: bool 

116 """If `True`, all keys in the first data ID are in the second, and are 

117 associated with the same values. 

118 

119 This includes the case where the first data ID is empty. 

120 """ 

121 

122 within: bool 

123 """If `True`, all keys in the second data ID are in the first, and are 

124 associated with the same values. 

125 

126 This includes the case where the second data ID is empty. 

127 """ 

128 

129 @property 

130 def equal(self) -> bool: 

131 """If `True`, the two data IDs are the same. 

132 

133 Data IDs are equal if they have both a `contains` and a `within` 

134 relationship. 

135 """ 

136 return self.contains and self.within 

137 

138 @property 

139 def disjoint(self) -> bool: 

140 """If `True`, the two data IDs have no keys in common. 

141 

142 This is simply the opposite of `overlaps`. Disjoint data IDs are by 

143 definition not inconsistent. 

144 """ 

145 return not self.overlaps 

146 

147 def __bool__(self) -> bool: 

148 return True 

149 

150 

151class Registry: 

152 """Registry interface. 

153 

154 Parameters 

155 ---------- 

156 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

157 Registry configuration 

158 """ 

159 

160 defaultConfigFile = None 

161 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

162 absolute path. Can be None if no defaults specified. 

163 """ 

164 

165 @classmethod 

166 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

167 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

168 """Create `Registry` subclass instance from `config`. 

169 

170 Uses ``registry.cls`` from `config` to determine which subclass to 

171 instantiate. 

172 

173 Parameters 

174 ---------- 

175 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

176 Registry configuration 

177 create : `bool`, optional 

178 Assume empty Registry and create a new one. 

179 butlerRoot : `str`, optional 

180 Path to the repository root this `Registry` will manage. 

181 writeable : `bool`, optional 

182 If `True` (default) create a read-write connection to the database. 

183 

184 Returns 

185 ------- 

186 registry : `Registry` (subclass) 

187 A new `Registry` subclass instance. 
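
Examples
--------
A minimal sketch; the configuration path ``registry.yaml`` is an
illustrative assumption, not a file provided by this package::

    config = RegistryConfig("registry.yaml")
    registry = Registry.fromConfig(config, create=True)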

188 """ 

189 if not isinstance(config, RegistryConfig): 

190 if isinstance(config, str) or isinstance(config, Config): 

191 config = RegistryConfig(config) 

192 else: 

193 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

194 config.replaceRoot(butlerRoot) 

195 DatabaseClass = config.getDatabaseClass() 

196 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

197 namespace=config.get("namespace"), writeable=writeable) 

198 universe = DimensionUniverse(config) 

199 opaque = doImport(config["managers", "opaque"]) 

200 dimensions = doImport(config["managers", "dimensions"]) 

201 collections = doImport(config["managers", "collections"]) 

202 datasets = doImport(config["managers", "datasets"]) 

203 datastoreBridges = doImport(config["managers", "datastores"]) 

204 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

205 datasets=datasets, datastoreBridges=datastoreBridges, create=create) 

206 

207 def __init__(self, database: Database, universe: DimensionUniverse, *, 

208 opaque: Type[OpaqueTableStorageManager], 

209 dimensions: Type[DimensionRecordStorageManager], 

210 collections: Type[CollectionManager], 

211 datasets: Type[DatasetRecordStorageManager], 

212 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

213 create: bool = False): 

214 self._db = database 

215 self.storageClasses = StorageClassFactory() 

216 with self._db.declareStaticTables(create=create) as context: 

217 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

218 self._collections = collections.initialize(self._db, context) 

219 self._datasets = datasets.initialize(self._db, context, 

220 collections=self._collections, 

221 universe=self.dimensions) 

222 self._opaque = opaque.initialize(self._db, context) 

223 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

224 opaque=self._opaque, 

225 datasets=datasets, 

226 universe=self.dimensions) 

227 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, 

228 self._collections, 

229 self._datasets)) 

230 self._collections.refresh() 

231 self._datasets.refresh(universe=self._dimensions.universe) 

232 

233 def __str__(self) -> str: 

234 return str(self._db) 

235 

236 def __repr__(self) -> str: 

237 return f"Registry({self._db!r}, {self.dimensions!r})" 

238 

239 def isWriteable(self) -> bool: 

240 """Return `True` if this registry allows write operations, and `False` 

241 otherwise. 

242 """ 

243 return self._db.isWriteable() 

244 

245 @property 

246 def dimensions(self) -> DimensionUniverse: 

247 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

248 """ 

249 return self._dimensions.universe 

250 

251 @contextlib.contextmanager 

252 def transaction(self): 

253 """Return a context manager that represents a transaction. 
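
Examples
--------
A sketch of grouping writes so they commit or roll back together;
``registry`` is an existing instance of this class, and the data ID,
run, and TAGGED collection ``good-seeing`` are assumed to exist::

    with registry.transaction():
        refs = registry.insertDatasets("raw", dataIds=[dataId], run="my/run")
        registry.associate("good-seeing", refs)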

254 """ 

255 # TODO make savepoint=False the default. 

256 try: 

257 with self._db.transaction(): 

258 yield 

259 except BaseException: 

260 # TODO: this clears the caches sometimes when we wouldn't actually 

261 # need to. Can we avoid that? 

262 self._dimensions.clearCaches() 

263 raise 

264 

265 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec): 

266 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

267 other data repository client. 

268 

269 Opaque table records can be added via `insertOpaqueData`, retrieved via 

270 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

271 

272 Parameters 

273 ---------- 

274 tableName : `str` 

275 Logical name of the opaque table. This may differ from the 

276 actual name used in the database by a prefix and/or suffix. 

277 spec : `ddl.TableSpec` 

278 Specification for the table to be added. 
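
Examples
--------
A sketch of the opaque-table round trip; the table name, columns, and
exact `ddl.FieldSpec` arguments are illustrative assumptions::

    spec = ddl.TableSpec(fields=[
        ddl.FieldSpec(name="dataset_id", dtype=sqlalchemy.BigInteger,
                      primaryKey=True),
        ddl.FieldSpec(name="path", dtype=sqlalchemy.String, length=256),
    ])
    registry.registerOpaqueTable("my_datastore_records", spec)
    registry.insertOpaqueData("my_datastore_records",
                              {"dataset_id": 1, "path": "a.fits"})
    rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))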

279 """ 

280 self._opaque.register(tableName, spec) 

281 

282 @transactional 

283 def insertOpaqueData(self, tableName: str, *data: dict): 

284 """Insert records into an opaque table. 

285 

286 Parameters 

287 ---------- 

288 tableName : `str` 

289 Logical name of the opaque table. Must match the name used in a 

290 previous call to `registerOpaqueTable`. 

291 data 

292 Each additional positional argument is a dictionary that represents 

293 a single row to be added. 

294 """ 

295 self._opaque[tableName].insert(*data) 

296 

297 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

298 """Retrieve records from an opaque table. 

299 

300 Parameters 

301 ---------- 

302 tableName : `str` 

303 Logical name of the opaque table. Must match the name used in a 

304 previous call to `registerOpaqueTable`. 

305 where 

306 Additional keyword arguments are interpreted as equality 

307 constraints that restrict the returned rows (combined with AND); 

308 keyword arguments are column names and values are the values they 

309 must have. 

310 

311 Yields 

312 ------ 

313 row : `dict` 

314 A dictionary representing a single result row. 

315 """ 

316 yield from self._opaque[tableName].fetch(**where) 

317 

318 @transactional 

319 def deleteOpaqueData(self, tableName: str, **where: Any): 

320 """Remove records from an opaque table. 

321 

322 Parameters 

323 ---------- 

324 tableName : `str` 

325 Logical name of the opaque table. Must match the name used in a 

326 previous call to `registerOpaqueTable`. 

327 where 

328 Additional keyword arguments are interpreted as equality 

329 constraints that restrict the deleted rows (combined with AND); 

330 keyword arguments are column names and values are the values they 

331 must have. 

332 """ 

333 self._opaque[tableName].delete(**where) 

334 

335 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED): 

336 """Add a new collection if one with the given name does not exist. 

337 

338 Parameters 

339 ---------- 

340 name : `str` 

341 The name of the collection to create. 

342 type : `CollectionType` 

343 Enum value indicating the type of collection to create. 

344 

345 Notes 

346 ----- 

347 This method cannot be called within transactions, as it needs to be 

348 able to perform its own transaction to be concurrent. 
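
Examples
--------
A sketch with illustrative collection names::

    registry.registerCollection("my/tagged", type=CollectionType.TAGGED)
    registry.registerRun("my/run")
    registry.getCollectionType("my/run")  # CollectionType.RUN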

349 """ 

350 self._collections.register(name, type) 

351 

352 def getCollectionType(self, name: str) -> CollectionType: 

353 """Return an enumeration value indicating the type of the given 

354 collection. 

355 

356 Parameters 

357 ---------- 

358 name : `str` 

359 The name of the collection. 

360 

361 Returns 

362 ------- 

363 type : `CollectionType` 

364 Enum value indicating the type of this collection. 

365 

366 Raises 

367 ------ 

368 MissingCollectionError 

369 Raised if no collection with the given name exists. 

370 """ 

371 return self._collections.find(name).type 

372 

373 def registerRun(self, name: str): 

374 """Add a new run if one with the given name does not exist. 

375 

376 Parameters 

377 ---------- 

378 name : `str` 

379 The name of the run to create. 

380 

381 Notes 

382 ----- 

383 This method cannot be called within transactions, as it needs to be 

384 able to perform its own transaction to be concurrent. 

385 """ 

386 self._collections.register(name, CollectionType.RUN) 

387 

388 @transactional 

389 def removeCollection(self, name: str): 

390 """Completely remove the given collection. 

391 

392 Parameters 

393 ---------- 

394 name : `str` 

395 The name of the collection to remove. 

396 

397 Raises 

398 ------ 

399 MissingCollectionError 

400 Raised if no collection with the given name exists. 

401 

402 Notes 

403 ----- 

404 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

405 in it are also fully removed. This requires that those datasets be 

406 removed (or at least trashed) from any datastores that hold them first. 

407 

408 A collection may not be deleted as long as it is referenced by a 

409 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

410 be deleted or redefined first. 

411 """ 

412 self._collections.remove(name) 

413 

414 def getCollectionChain(self, parent: str) -> CollectionSearch: 

415 """Return the child collections in a `~CollectionType.CHAINED` 

416 collection. 

417 

418 Parameters 

419 ---------- 

420 parent : `str` 

421 Name of the chained collection. Must have already been added via 

422 a call to `Registry.registerCollection`. 

423 

424 Returns 

425 ------- 

426 children : `CollectionSearch` 

427 An object that defines the search path of the collection. 

428 See :ref:`daf_butler_collection_expressions` for more information. 

429 

430 Raises 

431 ------ 

432 MissingCollectionError 

433 Raised if ``parent`` does not exist in the `Registry`. 

434 TypeError 

435 Raised if ``parent`` does not correspond to a 

436 `~CollectionType.CHAINED` collection. 

437 """ 

438 record = self._collections.find(parent) 

439 if record.type is not CollectionType.CHAINED: 

440 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

441 return record.children 

442 

443 @transactional 

444 def setCollectionChain(self, parent: str, children: Any): 

445 """Define or redefine a `~CollectionType.CHAINED` collection. 

446 

447 Parameters 

448 ---------- 

449 parent : `str` 

450 Name of the chained collection. Must have already been added via 

451 a call to `Registry.registerCollection`. 

452 children : `Any` 

453 An expression defining an ordered search of child collections, 

454 generally an iterable of `str`. Restrictions on the dataset types 

455 to be searched can also be included, by passing a mapping or an 

456 iterable containing tuples; see 

457 :ref:`daf_butler_collection_expressions` for more information. 

458 

459 Raises 

460 ------ 

461 MissingCollectionError 

462 Raised when any of the given collections do not exist in the 

463 `Registry`. 

464 TypeError 

465 Raised if ``parent`` does not correspond to a 

466 `~CollectionType.CHAINED` collection. 

467 ValueError 

468 Raised if the given collections contain a cycle. 
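
Examples
--------
A sketch that places two illustrative runs behind a single search name::

    registry.registerRun("run2")
    registry.registerRun("run1")
    registry.registerCollection("releases/latest", type=CollectionType.CHAINED)
    registry.setCollectionChain("releases/latest", ["run2", "run1"])
    registry.getCollectionChain("releases/latest")  # searches run2, then run1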

469 """ 

470 record = self._collections.find(parent) 

471 if record.type is not CollectionType.CHAINED: 

472 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

473 children = CollectionSearch.fromExpression(children) 

474 if children != record.children: 

475 record.update(self._collections, children) 

476 

477 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

478 """ 

479 Add a new `DatasetType` to the Registry. 

480 

481 It is not an error to register the same `DatasetType` twice. 

482 

483 Parameters 

484 ---------- 

485 datasetType : `DatasetType` 

486 The `DatasetType` to be added. 

487 

488 Returns 

489 ------- 

490 inserted : `bool` 

491 `True` if ``datasetType`` was inserted, `False` if an identical 

492 existing `DatasetType` was found. Note that in either case the 

493 DatasetType is guaranteed to be defined in the Registry 

494 consistently with the given definition. 

495 

496 Raises 

497 ------ 

498 ValueError 

499 Raised if the dimensions or storage class are invalid. 

500 ConflictingDefinitionError 

501 Raised if this DatasetType is already registered with a different 

502 definition. 

503 

504 Notes 

505 ----- 

506 This method cannot be called within transactions, as it needs to be 

507 able to perform its own transaction to be concurrent. 
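
Examples
--------
A sketch; the dataset type name, dimensions, and storage class are
illustrative and must be consistent with the repository configuration::

    datasetType = DatasetType("calexp",
                              dimensions=("instrument", "visit", "detector"),
                              storageClass="ExposureF",
                              universe=registry.dimensions)
    registry.registerDatasetType(datasetType)  # returns True on first call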

508 """ 

509 _, inserted = self._datasets.register(datasetType) 

510 return inserted 

511 

512 def getDatasetType(self, name: str) -> DatasetType: 

513 """Get the `DatasetType`. 

514 

515 Parameters 

516 ---------- 

517 name : `str` 

518 Name of the type. 

519 

520 Returns 

521 ------- 

522 type : `DatasetType` 

523 The `DatasetType` associated with the given name. 

524 

525 Raises 

526 ------ 

527 KeyError 

528 Raised if the requested dataset type could not be found in the registry. 

529 """ 

530 storage = self._datasets.find(name) 

531 if storage is None: 

532 raise KeyError(f"DatasetType '{name}' could not be found.") 

533 return storage.datasetType 

534 

535 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

536 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

537 """Find a dataset given its `DatasetType` and data ID. 

538 

539 This can be used to obtain a `DatasetRef` that permits the dataset to 

540 be read from a `Datastore`. If the dataset is a component and can not 

541 be found using the provided dataset type, a dataset ref for the parent 

542 will be returned instead but with the correct dataset type. 

543 

544 Parameters 

545 ---------- 

546 datasetType : `DatasetType` or `str` 

547 A `DatasetType` or the name of one. 

548 dataId : `dict` or `DataCoordinate`, optional 

549 A `dict`-like object containing the `Dimension` links that identify 

550 the dataset within a collection. 

551 collections 

552 An expression that fully or partially identifies the collections 

553 to search for the dataset, such as a `str`, `re.Pattern`, or 

554 iterable thereof. `...` can be used to return all collections. 

555 See :ref:`daf_butler_collection_expressions` for more information. 

556 **kwargs 

557 Additional keyword arguments passed to 

558 `DataCoordinate.standardize` to convert ``dataId`` to a true 

559 `DataCoordinate` or augment an existing one. 

560 

561 Returns 

562 ------- 

563 ref : `DatasetRef` 

564 A reference to the dataset, or `None` if no matching Dataset 

565 was found. 

566 

567 Raises 

568 ------ 

569 LookupError 

570 Raised if one or more data ID keys are missing or the dataset type 

571 does not exist. 

572 MissingCollectionError 

573 Raised if any of ``collections`` does not exist in the registry. 
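
Examples
--------
A sketch; the dataset type, data ID values, and collection name are
illustrative::

    ref = registry.findDataset("calexp",
                               instrument="HSC", visit=42, detector=50,
                               collections=["my/run"])
    if ref is not None:
        print(ref.id, ref.run)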

574 """ 

575 if isinstance(datasetType, DatasetType): 

576 storage = self._datasets.find(datasetType.name) 

577 if storage is None: 

578 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

579 else: 

580 storage = self._datasets.find(datasetType) 

581 if storage is None: 

582 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

583 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

584 universe=self.dimensions, **kwargs) 

585 collections = CollectionSearch.fromExpression(collections) 

586 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

587 result = storage.find(collectionRecord, dataId) 

588 if result is not None: 

589 return result 

590 

591 # fallback to the parent if we got nothing and this was a component 

592 if storage.datasetType.isComponent(): 

593 parentType, _ = storage.datasetType.nameAndComponent() 

594 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs) 

595 if parentRef is not None: 

596 # Should already conform and we know no components 

597 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id, 

598 run=parentRef.run, conform=False, hasParentId=True) 

599 

600 return None 

601 

602 @transactional 

603 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

604 run: str, *, producer: Optional[Quantum] = None) -> List[DatasetRef]: 

605 """Insert one or more datasets into the `Registry` 

606 

607 This always adds new datasets; to associate existing datasets with 

608 a new collection, use ``associate``. 

609 

610 Parameters 

611 ---------- 

612 datasetType : `DatasetType` or `str` 

613 A `DatasetType` or the name of one. 

614 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

615 Dimension-based identifiers for the new datasets. 

616 run : `str` 

617 The name of the run that produced the datasets. 

618 producer : `Quantum` 

619 Unit of work that produced the datasets. May be `None` to store 

620 no provenance information, but if present the `Quantum` must 

621 already have been added to the Registry. 

622 

623 Returns 

624 ------- 

625 refs : `list` of `DatasetRef` 

626 Resolved `DatasetRef` instances for all given data IDs (in the same 

627 order). 

628 

629 Raises 

630 ------ 

631 ConflictingDefinitionError 

632 If a dataset with the same dataset type and data ID as one of those 

633 given already exists in ``run``. 

634 MissingCollectionError 

635 Raised if ``run`` does not exist in the registry. 
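
Examples
--------
A sketch; the run is assumed to exist (see `registerRun`) and the
dimension records referenced by the data ID must already be present::

    refs = registry.insertDatasets(
        "raw",
        dataIds=[{"instrument": "HSC", "exposure": 903334, "detector": 16}],
        run="my/run",
    )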

636 """ 

637 if isinstance(datasetType, DatasetType): 

638 storage = self._datasets.find(datasetType.name) 

639 if storage is None: 

640 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

641 else: 

642 storage = self._datasets.find(datasetType) 

643 if storage is None: 

644 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

645 runRecord = self._collections.find(run) 

646 dataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) for dataId in dataIds] 

647 try: 

648 refs = list(storage.insert(runRecord, dataIds, quantum=producer)) 

649 except sqlalchemy.exc.IntegrityError as err: 

650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

651 f"one or more datasets of type {storage.datasetType} into " 

652 f"collection '{run}'. " 

653 f"This probably means a dataset with the same data ID " 

654 f"and dataset type already exists, but it may also mean a " 

655 f"dimension row is missing.") from err 

656 return refs 

657 

658 def getDataset(self, id: int) -> Optional[DatasetRef]: 

659 """Retrieve a Dataset entry. 

660 

661 Parameters 

662 ---------- 

663 id : `int` 

664 The unique identifier for the dataset. 

665 

666 Returns 

667 ------- 

668 ref : `DatasetRef` or `None` 

669 A ref to the Dataset, or `None` if no matching Dataset 

670 was found. 

671 """ 

672 ref = self._datasets.getDatasetRef(id) 

673 if ref is None: 

674 return None 

675 return ref 

676 

677 @transactional 

678 def removeDatasets(self, refs: Iterable[DatasetRef]): 

679 """Remove datasets from the Registry. 

680 

681 The datasets will be removed unconditionally from all collections, and 

682 any `Quantum` that consumed this dataset will instead be marked with 

683 having a NULL input. `Datastore` records will *not* be deleted; the 

684 caller is responsible for ensuring that the dataset has already been 

685 removed from all Datastores. 

686 

687 Parameters 

688 ---------- 

689 refs : `Iterable` of `DatasetRef` 

690 References to the datasets to be removed. Must include a valid 

691 ``id`` attribute, and should be considered invalidated upon return. 

692 

693 Raises 

694 ------ 

695 AmbiguousDatasetError 

696 Raised if any ``ref.id`` is `None`. 

697 OrphanedRecordError 

698 Raised if any dataset is still present in any `Datastore`. 

699 """ 

700 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

701 storage = self._datasets.find(datasetType.name) 

702 try: 

703 storage.delete(refsForType) 

704 except sqlalchemy.exc.IntegrityError as err: 

705 raise OrphanedRecordError("One or more datasets is still " 

706 "present in one or more Datastores.") from err 

707 

708 @transactional 

709 def associate(self, collection: str, refs: Iterable[DatasetRef]): 

710 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

711 

712 If a DatasetRef with the same exact integer ID is already in a 

713 collection nothing is changed. If a `DatasetRef` with the same 

714 `DatasetType` and data ID but with different integer ID 

715 exists in the collection, `ConflictingDefinitionError` is raised. 

716 

717 Parameters 

718 ---------- 

719 collection : `str` 

720 Indicates the collection the datasets should be associated with. 

721 refs : `Iterable` [ `DatasetRef` ] 

722 An iterable of resolved `DatasetRef` instances that already exist 

723 in this `Registry`. 

724 

725 Raises 

726 ------ 

727 ConflictingDefinitionError 

728 If a Dataset with the given `DatasetRef` already exists in the 

729 given collection. 

730 AmbiguousDatasetError 

731 Raised if ``any(ref.id is None for ref in refs)``. 

732 MissingCollectionError 

733 Raised if ``collection`` does not exist in the registry. 

734 TypeError 

735 Raised if adding new datasets to the given ``collection`` is not 

736 allowed. 
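
Examples
--------
A sketch that tags previously inserted datasets into an illustrative
TAGGED collection::

    registry.registerCollection("good-seeing", type=CollectionType.TAGGED)
    refs = registry.queryDatasets("calexp", collections=["my/run"])
    registry.associate("good-seeing", refs)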

737 """ 

738 collectionRecord = self._collections.find(collection) 

739 if collectionRecord.type is not CollectionType.TAGGED: 

740 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

741 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

742 storage = self._datasets.find(datasetType.name) 

743 try: 

744 storage.associate(collectionRecord, refsForType) 

745 except sqlalchemy.exc.IntegrityError as err: 

746 raise ConflictingDefinitionError( 

747 f"Constraint violation while associating dataset of type {datasetType.name} with " 

748 f"collection {collection}. This probably means that one or more datasets with the same " 

749 f"dataset type and data ID already exist in the collection, but it may also indicate " 

750 f"that the datasets do not exist." 

751 ) from err 

752 

753 @transactional 

754 def disassociate(self, collection: str, refs: Iterable[DatasetRef]): 

755 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

756 

757 ``collection`` and ``ref`` combinations that are not currently 

758 associated are silently ignored. 

759 

760 Parameters 

761 ---------- 

762 collection : `str` 

763 The collection the datasets should no longer be associated with. 

764 refs : `Iterable` [ `DatasetRef` ] 

765 An iterable of resolved `DatasetRef` instances that already exist 

766 in this `Registry`. 

767 

768 Raises 

769 ------ 

770 AmbiguousDatasetError 

771 Raised if any of the given dataset references is unresolved. 

772 MissingCollectionError 

773 Raised if ``collection`` does not exist in the registry. 

774 TypeError 

775 Raised if removing datasets from the given ``collection`` is not 

776 allowed. 

777 """ 

778 collectionRecord = self._collections.find(collection) 

779 if collectionRecord.type is not CollectionType.TAGGED: 

780 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

781 "expected TAGGED.") 

782 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

783 storage = self._datasets.find(datasetType.name) 

784 storage.disassociate(collectionRecord, refsForType) 

785 

786 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

787 """Return an object that allows a new `Datastore` to communicate with this `Registry`.""" 

788 return self._datastoreBridges 

789 

790 def getDatasetLocations(self, ref: DatasetRef) -> Iterator[str]: 

791 """Retrieve datastore locations for a given dataset. 

792 

793 Typically used by `Datastore`. 

794 

795 Parameters 

796 ---------- 

797 ref : `DatasetRef` 

798 A reference to the dataset for which to retrieve storage 

799 information. 

800 

801 Returns 

802 ------- 

803 datastores : `Iterable` [ `str` ] 

804 All the matching datastores holding this dataset. 

805 

806 Raises 

807 ------ 

808 AmbiguousDatasetError 

809 Raised if ``ref.id`` is `None`. 

810 """ 

811 return self._datastoreBridges.findDatastores(ref) 

812 

813 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

814 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds): 

815 """Expand a dimension-based data ID to include additional information. 

816 

817 Parameters 

818 ---------- 

819 dataId : `DataCoordinate` or `dict`, optional 

820 Data ID to be expanded; augmented and overridden by ``kwds``. 

821 graph : `DimensionGraph`, optional 

822 Set of dimensions for the expanded ID. If `None`, the dimensions 

823 will be inferred from the keys of ``dataId`` and ``kwds``. 

824 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

825 are silently ignored, providing a way to extract and expand a 

826 subset of a data ID. 

827 records : mapping [`DimensionElement`, `DimensionRecord`], optional 

828 Dimension record data to use before querying the database for that 

829 data. 

830 **kwds 

831 Additional keywords are treated like additional key-value pairs for 

832 ``dataId``, extending and overriding it. 

833 

834 Returns 

835 ------- 

836 expanded : `ExpandedDataCoordinate` 

837 A data ID that includes full metadata for all of the dimensions it 

838 identifies. 
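
Examples
--------
A sketch; the dimension values are illustrative and must correspond to
records already present in the registry::

    dataId = registry.expandDataId(instrument="HSC", exposure=903334)
    # dataId is an ExpandedDataCoordinate carrying full dimension records.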

839 """ 

840 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds) 

841 if isinstance(standardized, ExpandedDataCoordinate): 

842 return standardized 

843 elif isinstance(dataId, ExpandedDataCoordinate): 

844 records = dict(records) if records is not None else {} 

845 records.update(dataId.records) 

846 else: 

847 records = dict(records) if records is not None else {} 

848 keys = dict(standardized) 

849 regions = [] 

850 timespans = [] 

851 for element in standardized.graph.primaryKeyTraversalOrder: 

852 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

853 if record is ...: 

854 storage = self._dimensions[element] 

855 record = storage.fetch(keys) 

856 records[element] = record 

857 if record is not None: 

858 for d in element.implied: 

859 value = getattr(record, d.name) 

860 if keys.setdefault(d, value) != value: 

861 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, " 

862 f"but {element.name} implies {d.name}={value!r}.") 

863 if element in standardized.graph.spatial and record.region is not None: 

864 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

865 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

866 f"is disjoint with those for other elements.") 

867 regions.append(record.region) 

868 if element in standardized.graph.temporal: 

869 if any(not record.timespan.overlaps(t) for t in timespans): 

870 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

871 f" is disjoint with those for other elements.") 

872 timespans.append(record.timespan) 

873 else: 

874 if element in standardized.graph.required: 

875 raise LookupError( 

876 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

877 ) 

878 if element.alwaysJoin: 

879 raise InconsistentDataIdError( 

880 f"Could not fetch record for element {element.name} via keys {keys}, " 

881 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

882 "related." 

883 ) 

884 records.update((d, None) for d in element.implied) 

885 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

886 

887 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

888 """Compare the keys and values of a pair of data IDs for consistency. 

889 

890 See `ConsistentDataIds` for more information. 

891 

892 Parameters 

893 ---------- 

894 a : `dict` or `DataCoordinate` 

895 First data ID to be compared. 

896 b : `dict` or `DataCoordinate` 

897 Second data ID to be compared. 

898 

899 Returns 

900 ------- 

901 relationship : `ConsistentDataIds` or `None` 

902 Relationship information. This is not `None` and coerces to 

903 `True` in boolean contexts if and only if the data IDs are 

904 consistent in terms of all common key-value pairs, all many-to-many 

905 join tables, and all spatial and temporal relationships. 
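
Examples
--------
A sketch with illustrative data IDs; they share ``instrument`` with the
same value, so they are consistent, and the first contains the second::

    rel = registry.relateDataIds({"instrument": "HSC", "visit": 42},
                                 {"instrument": "HSC"})
    if rel and rel.contains:
        ...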

906 """ 

907 a = DataCoordinate.standardize(a, universe=self.dimensions) 

908 b = DataCoordinate.standardize(b, universe=self.dimensions) 

909 aFull = getattr(a, "full", None) 

910 bFull = getattr(b, "full", None) 

911 aBest = aFull if aFull is not None else a 

912 bBest = bFull if bFull is not None else b 

913 jointKeys = aBest.keys() & bBest.keys() 

914 # If any common values are not equal, we know they are inconsistent. 

915 if any(aBest[k] != bBest[k] for k in jointKeys): 

916 return None 

917 # If the graphs are equal, we know the data IDs are. 

918 if a.graph == b.graph: 

919 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

920 # Result is still inconclusive. Try to expand a data ID containing 

921 # keys from both; that will fail if they are inconsistent. 

922 # First, if either input was already an ExpandedDataCoordinate, extract 

923 # its records so we don't have to query for them. 

924 records = {} 

925 if hasattr(a, "records"): 

926 records.update(a.records) 

927 if hasattr(b, "records"): 

928 records.update(b.records) 

929 try: 

930 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records) 

931 except InconsistentDataIdError: 

932 return None 

933 # We know the answer is not `None`; time to figure out what it is. 

934 return ConsistentDataIds( 

935 contains=(a.graph >= b.graph), 

936 within=(a.graph <= b.graph), 

937 overlaps=bool(a.graph & b.graph), 

938 ) 

939 

940 def insertDimensionData(self, element: Union[DimensionElement, str], 

941 *data: Union[dict, DimensionRecord], 

942 conform: bool = True): 

943 """Insert one or more dimension records into the database. 

944 

945 Parameters 

946 ---------- 

947 element : `DimensionElement` or `str` 

948 The `DimensionElement` or name thereof that identifies the table 

949 records will be inserted into. 

950 data : `dict` or `DimensionRecord` (variadic) 

951 One or more records to insert. 

952 conform : `bool`, optional 

953 If `False` (`True` is default) perform no checking or conversions, 

954 and assume that ``element`` is a `DimensionElement` instance and 

955 ``data`` is one or more `DimensionRecord` instances of the 

956 appropriate subclass. 
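
Examples
--------
A sketch with illustrative records; the exact fields required by each
element are defined by the registry's dimension configuration::

    registry.insertDimensionData("instrument", {"name": "DummyCam"})
    registry.insertDimensionData(
        "physical_filter",
        {"instrument": "DummyCam", "name": "d-r", "abstract_filter": "r"},
    )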

957 """ 

958 if conform: 

959 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

960 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

961 for row in data] 

962 else: 

963 records = data 

964 storage = self._dimensions[element] 

965 storage.insert(*records) 

966 

967 def syncDimensionData(self, element: Union[DimensionElement, str], 

968 row: Union[dict, DimensionRecord], 

969 conform: bool = True) -> bool: 

970 """Synchronize the given dimension record with the database, inserting 

971 if it does not already exist and comparing values if it does. 

972 

973 Parameters 

974 ---------- 

975 element : `DimensionElement` or `str` 

976 The `DimensionElement` or name thereof that identifies the table 

977 records will be inserted into. 

978 row : `dict` or `DimensionRecord` 

979 The record to insert. 

980 conform : `bool`, optional 

981 If `False` (`True` is default) perform no checking or conversions, 

982 and assume that ``element`` is a `DimensionElement` instance and 

983 ``row`` is a `DimensionRecord` instance of the 

984 appropriate subclass. 

985 

986 Returns 

987 ------- 

988 inserted : `bool` 

989 `True` if a new row was inserted, `False` otherwise. 

990 

991 Raises 

992 ------ 

993 ConflictingDefinitionError 

994 Raised if the record exists in the database (according to primary 

995 key lookup) but is inconsistent with the given one. 

996 

997 Notes 

998 ----- 

999 This method cannot be called within transactions, as it needs to be 

1000 able to perform its own transaction to be concurrent. 

1001 """ 

1002 if conform: 

1003 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1004 record = element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1005 else: 

1006 record = row 

1007 storage = self._dimensions[element] 

1008 return storage.sync(record) 

1009 

1010 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1011 ) -> Iterator[DatasetType]: 

1012 """Iterate over the dataset types whose names match an expression. 

1013 

1014 Parameters 

1015 ---------- 

1016 expression : `Any`, optional 

1017 An expression that fully or partially identifies the dataset types 

1018 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1019 `...` can be used to return all dataset types, and is the default. 

1020 See :ref:`daf_butler_dataset_type_expressions` for more 

1021 information. 

1022 components : `bool`, optional 

1023 If `True`, apply all expression patterns to component dataset type 

1024 names as well. If `False`, never apply patterns to components. 

1025 If `None` (default), apply patterns to components only if their 

1026 parent datasets were not matched by the expression. 

1027 Fully-specified component datasets (`str` or `DatasetType` 

1028 instances) are always included. 

1029 

1030 Yields 

1031 ------ 

1032 datasetType : `DatasetType` 

1033 A `DatasetType` instance whose name matches ``expression``. 
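
Examples
--------
A sketch; the names and pattern are illustrative::

    import re

    list(registry.queryDatasetTypes("calexp"))             # exact name
    list(registry.queryDatasetTypes(re.compile("raw.*")))  # regex pattern
    list(registry.queryDatasetTypes(...))                  # all dataset types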

1034 """ 

1035 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1036 if wildcard is ...: 

1037 for datasetType in self._datasets: 

1038 if components or not datasetType.isComponent(): 

1039 yield datasetType 

1040 return 

1041 done = set() 

1042 for name in wildcard.strings: 

1043 storage = self._datasets.find(name) 

1044 if storage is not None: 

1045 done.add(storage.datasetType) 

1046 yield storage.datasetType 

1047 if wildcard.patterns: 

1048 # If components (the argument) is None, we'll save component 

1049 # datasets that we might want to match, but only if their parents 

1050 # didn't get included. 

1051 componentsForLater = [] 

1052 for datasetType in self._datasets: 

1053 if datasetType.name in done: 

1054 continue 

1055 parentName, componentName = datasetType.nameAndComponent() 

1056 if componentName is not None and not components: 

1057 if components is None and parentName not in done: 

1058 componentsForLater.append(datasetType) 

1059 continue 

1060 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1061 done.add(datasetType.name) 

1062 yield datasetType 

1063 # Go back and try to match saved components. 

1064 for datasetType in componentsForLater: 

1065 parentName, _ = datasetType.nameAndComponent() 

1066 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1067 yield datasetType 

1068 

1069 def queryCollections(self, expression: Any = ..., 

1070 datasetType: Optional[DatasetType] = None, 

1071 collectionType: Optional[CollectionType] = None, 

1072 flattenChains: bool = False, 

1073 includeChains: Optional[bool] = None) -> Iterator[str]: 

1074 """Iterate over the collections whose names match an expression. 

1075 

1076 Parameters 

1077 ---------- 

1078 expression : `Any`, optional 

1079 An expression that fully or partially identifies the collections 

1080 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1081 `...` can be used to return all collections, and is the default. 

1082 See :ref:`daf_butler_collection_expressions` for more 

1083 information. 

1084 datasetType : `DatasetType`, optional 

1085 If provided, only yield collections that should be searched for 

1086 this dataset type according to ``expression``. If this is 

1087 not provided, any dataset type restrictions in ``expression`` are 

1088 ignored. 

1089 collectionType : `CollectionType`, optional 

1090 If provided, only yield collections of this type. 

1091 flattenChains : `bool`, optional 

1092 If `True` (`False` is default), recursively yield the child 

1093 collections of matching `~CollectionType.CHAINED` collections. 

1094 includeChains : `bool`, optional 

1095 If `True`, yield records for matching `~CollectionType.CHAINED` 

1096 collections. Default is the opposite of ``flattenChains``: include 

1097 either CHAINED collections or their children, but not both. 

1098 

1099 Yields 

1100 ------ 

1101 collection : `str` 

1102 The name of a collection that matches ``expression``. 

1103 """ 

1104 query = CollectionQuery.fromExpression(expression) 

1105 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1106 flattenChains=flattenChains, includeChains=includeChains): 

1107 yield record.name 

1108 

1109 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1110 """Return a `QueryBuilder` instance capable of constructing and 

1111 managing more complex queries than those obtainable via `Registry` 

1112 interfaces. 

1113 

1114 This is an advanced interface; downstream code should prefer 

1115 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1116 are sufficient. 

1117 

1118 Parameters 

1119 ---------- 

1120 summary : `QuerySummary` 

1121 Object describing and categorizing the full set of dimensions that 

1122 will be included in the query. 

1123 

1124 Returns 

1125 ------- 

1126 builder : `QueryBuilder` 

1127 Object that can be used to construct and perform advanced queries. 

1128 """ 

1129 return QueryBuilder(summary=summary, 

1130 collections=self._collections, 

1131 dimensions=self._dimensions, 

1132 datasets=self._datasets) 

1133 

1134 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1135 dataId: Optional[DataId] = None, 

1136 datasets: Any = None, 

1137 collections: Any = None, 

1138 where: Optional[str] = None, 

1139 expand: bool = True, 

1140 components: Optional[bool] = None, 

1141 **kwds) -> Iterator[DataCoordinate]: 

1142 """Query for and iterate over data IDs matching user-provided criteria. 

1143 

1144 Parameters 

1145 ---------- 

1146 dimensions : `Dimension` or `str`, or iterable thereof 

1147 The dimensions of the data IDs to yield, as either `Dimension` 

1148 instances or `str`. Will be automatically expanded to a complete 

1149 `DimensionGraph`. 

1150 dataId : `dict` or `DataCoordinate`, optional 

1151 A data ID whose key-value pairs are used as equality constraints 

1152 in the query. 

1153 datasets : `Any`, optional 

1154 An expression that fully or partially identifies dataset types 

1155 that should constrain the yielded data IDs. For example, including 

1156 "raw" here would constrain the yielded ``instrument``, 

1157 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1158 those for which at least one "raw" dataset exists in 

1159 ``collections``. Allowed types include `DatasetType`, `str`, 

1160 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1161 expressions, `...` is not permitted - it doesn't make sense to 

1162 constrain data IDs on the existence of *all* datasets. 

1163 See :ref:`daf_butler_dataset_type_expressions` for more 

1164 information. 

1165 collections : `Any`, optional 

1166 An expression that fully or partially identifies the collections 

1167 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1168 thereof. `...` can be used to return all collections. Must be 

1169 provided if ``datasets`` is, and is ignored if it is not. See 

1170 :ref:`daf_butler_collection_expressions` for more information. 

1171 where : `str`, optional 

1172 A string expression similar to a SQL WHERE clause. May involve 

1173 any column of a dimension table or (as a shortcut for the primary 

1174 key column of a dimension table) dimension name. See 

1175 :ref:`daf_butler_dimension_expressions` for more information. 

1176 expand : `bool`, optional 

1177 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1178 minimal `DataCoordinate` base-class instances. 

1179 components : `bool`, optional 

1180 If `True`, apply all dataset expression patterns to component 

1181 dataset type names as well. If `False`, never apply patterns to 

1182 components. If `None` (default), apply patterns to components only 

1183 if their parent datasets were not matched by the expression. 

1184 Fully-specified component datasets (`str` or `DatasetType` 

1185 instances) are always included. 

1186 kwds 

1187 Additional keyword arguments are forwarded to 

1188 `DataCoordinate.standardize` when processing the ``dataId`` 

1189 argument (and may be used to provide a constraining data ID even 

1190 when the ``dataId`` argument is `None`). 

1191 

1192 Yields 

1193 ------ 

1194 dataId : `DataCoordinate` 

1195 Data IDs matching the given query parameters. Order is 

1196 unspecified. 
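
Examples
--------
A sketch; the dimensions, dataset type, collection, and ``where`` string
are illustrative::

    dataIds = registry.queryDimensions(
        ["exposure", "detector"],
        datasets="raw",
        collections=["my/run"],
        where="instrument = 'HSC'",
    )
    for dataId in dataIds:
        ...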

1197 """ 

1198 dimensions = iterable(dimensions) 

1199 standardizedDataId = self.expandDataId(dataId, **kwds) 

1200 standardizedDatasetTypes = set() 

1201 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1202 if datasets is not None: 

1203 if collections is None: 

1204 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1205 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1206 requestedDimensionNames.update(datasetType.dimensions.names) 

1207 # If any matched dataset type is a component, just operate on 

1208 # its parent instead, because Registry doesn't know anything 

1209 # about what components exist, and here (unlike queryDatasets) 

1210 # we don't care about returning them. 

1211 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1212 if componentName is not None: 

1213 datasetType = self.getDatasetType(parentDatasetTypeName) 

1214 standardizedDatasetTypes.add(datasetType) 

1215 # Preprocess collections expression in case the original included 

1216 # single-pass iterators (we'll want to use it multiple times 

1217 # below). 

1218 collections = CollectionQuery.fromExpression(collections) 

1219 

1220 summary = QuerySummary( 

1221 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1222 dataId=standardizedDataId, 

1223 expression=where, 

1224 ) 

1225 builder = self.makeQueryBuilder(summary) 

1226 for datasetType in standardizedDatasetTypes: 

1227 builder.joinDataset(datasetType, collections, isResult=False) 

1228 query = builder.finish() 

1229 predicate = query.predicate() 

1230 for row in self._db.query(query.sql): 

1231 if predicate(row): 

1232 result = query.extractDataId(row) 

1233 if expand: 

1234 yield self.expandDataId(result, records=standardizedDataId.records) 

1235 else: 

1236 yield result 

1237 

1238 def queryDatasets(self, datasetType: Any, *, 

1239 collections: Any, 

1240 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1241 dataId: Optional[DataId] = None, 

1242 where: Optional[str] = None, 

1243 deduplicate: bool = False, 

1244 expand: bool = True, 

1245 components: Optional[bool] = None, 

1246 **kwds) -> Iterator[DatasetRef]: 

1247 """Query for and iterate over dataset references matching user-provided 

1248 criteria. 

1249 

1250 Parameters 

1251 ---------- 

1252 datasetType 

1253 An expression that fully or partially identifies the dataset types 

1254 to be queried. Allowed types include `DatasetType`, `str`, 

1255 `re.Pattern`, and iterables thereof. The special value `...` can 

1256 be used to query all dataset types. See 

1257 :ref:`daf_butler_dataset_type_expressions` for more information. 

1258 collections 

1259 An expression that fully or partially identifies the collections 

1260 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1261 thereof. `...` can be used to return all collections. See 

1262 :ref:`daf_butler_collection_expressions` for more information. 

1263 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1264 Dimensions to include in the query (in addition to those used 

1265 to identify the queried dataset type(s)), either to constrain 

1266 the resulting datasets to those for which a matching dimension 

1267 exists, or to relate the dataset type's dimensions to dimensions 

1268 referenced by the ``dataId`` or ``where`` arguments. 

1269 dataId : `dict` or `DataCoordinate`, optional 

1270 A data ID whose key-value pairs are used as equality constraints 

1271 in the query. 

1272 where : `str`, optional 

1273 A string expression similar to a SQL WHERE clause. May involve 

1274 any column of a dimension table or (as a shortcut for the primary 

1275 key column of a dimension table) dimension name. See 

1276 :ref:`daf_butler_dimension_expressions` for more information. 

1277 deduplicate : `bool`, optional 

1278 If `True` (`False` is default), for each result data ID, only 

1279 yield one `DatasetRef` of each `DatasetType`, from the first 

1280 collection in which a dataset of that dataset type appears 

1281 (according to the order of ``collections`` passed in). If `True`, 

1282 ``collections`` must not contain regular expressions and may not 

1283 be `...`. 

1284 expand : `bool`, optional 

1285 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1286 minimal `DataCoordinate` base-class instances. 

1287 components : `bool`, optional 

1288 If `True`, apply all dataset expression patterns to component 

1289 dataset type names as well. If `False`, never apply patterns to 

1290 components. If `None` (default), apply patterns to components only 

1291 if their parent datasets were not matched by the expression. 

1292 Fully-specified component datasets (`str` or `DatasetType` 

1293 instances) are always included. 

1294 kwds 

1295 Additional keyword arguments are forwarded to 

1296 `DataCoordinate.standardize` when processing the ``dataId`` 

1297 argument (and may be used to provide a constraining data ID even 

1298 when the ``dataId`` argument is `None`). 

1299 

1300 Yields 

1301 ------ 

1302 ref : `DatasetRef` 

1303 Dataset references matching the given query criteria. These 

1304 are grouped by `DatasetType` if the query evaluates to multiple 

1305 dataset types, but order is otherwise unspecified. 

1306 

1307 Raises 

1308 ------ 

1309 TypeError 

1310 Raised when the arguments are incompatible, such as when a 

1311 collection wildcard is passed when ``deduplicate`` is `True`. 

1312 

1313 Notes 

1314 ----- 

1315 When multiple dataset types are queried in a single call, the 

1316 results of this operation are equivalent to querying for each dataset 

1317 type separately in turn, and no information about the relationships 

1318 between datasets of different types is included. In contexts where 

1319 that kind of information is important, the recommended pattern is to 

1320 use `queryDimensions` to first obtain data IDs (possibly with the 

1321 desired dataset types and collections passed as constraints to the 

1322 query), and then use multiple (generally much simpler) calls to 

1323 `queryDatasets` with the returned data IDs passed as constraints. 
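
Examples
--------
A sketch; the dataset type, collections, and ``where`` string are
illustrative::

    refs = registry.queryDatasets(
        "calexp",
        collections=["run2", "run1"],
        where="instrument = 'HSC' AND visit = 42",
        deduplicate=True,
    )
    for ref in refs:
        ...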

1324 """ 

1325 # Standardize the collections expression. 

1326 if deduplicate: 

1327 collections = CollectionSearch.fromExpression(collections) 

1328 else: 

1329 collections = CollectionQuery.fromExpression(collections) 

1330 # Standardize and expand the data ID provided as a constraint. 

1331 standardizedDataId = self.expandDataId(dataId, **kwds) 

1332 

1333 # We can only query directly if given a non-component DatasetType 

1334 # instance. If we were given an expression or str or a component 

1335 # DatasetType instance, we'll populate this dict, recurse, and return. 

1336 # If we already have a non-component DatasetType, it will remain None 

1337 # and we'll run the query directly. 

1338 composition: Optional[ 

1339 Dict[ 

1340 DatasetType, # parent dataset type 

1341 List[Optional[str]] # component name, or None for parent 

1342 ] 

1343 ] = None 

1344 if not isinstance(datasetType, DatasetType): 

1345 # We were given a dataset type expression (which may be as simple 

1346 # as a str). Loop over all matching datasets, delegating handling 

1347 # of the `components` argument to queryDatasetTypes, as we populate 

1348 # the composition dict. 

1349 composition = defaultdict(list) 

1350 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1351 parentName, componentName = trueDatasetType.nameAndComponent() 

1352 if componentName is not None: 

1353 parentDatasetType = self.getDatasetType(parentName) 

1354 composition.setdefault(parentDatasetType, []).append(componentName) 

1355 else: 

1356 composition.setdefault(trueDatasetType, []).append(None) 

1357 elif datasetType.isComponent(): 

1358 # We were given a true DatasetType instance, but it's a component. 

1359 # the composition dict will have exactly one item. 

1360 parentName, componentName = datasetType.nameAndComponent() 

1361 parentDatasetType = self.getDatasetType(parentName) 

1362 composition = {parentDatasetType: [componentName]} 

1363 if composition is not None: 

1364 # We need to recurse. Do that once for each parent dataset type. 

1365 for parentDatasetType, componentNames in composition.items(): 

1366 for parentRef in self.queryDatasets(parentDatasetType, collections=collections, 

1367 dimensions=dimensions, dataId=standardizedDataId, 

1368 where=where, deduplicate=deduplicate): 

1369 # Loop over components, yielding one ref for each requested 

1370 # component. 

1371 for componentName in componentNames: 

1372 if componentName is None: 

1373 yield parentRef 

1374 else: 

1375 yield parentRef.makeComponentRef(componentName) 

1376 return 

1377 # If we get here, there's no need to recurse (or we are already 

1378 # recursing; there can only ever be one level of recursion). 

1379 

1380 # The full set of dimensions in the query is the combination of those 

1381 # needed for the DatasetType and those explicitly requested, if any. 

1382 requestedDimensionNames = set(datasetType.dimensions.names) 

1383 if dimensions is not None: 

1384 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1385 # Construct the summary structure needed to construct a QueryBuilder. 

1386 summary = QuerySummary( 

1387 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1388 dataId=standardizedDataId, 

1389 expression=where, 

1390 ) 

1391 builder = self.makeQueryBuilder(summary) 

1392 # Add the dataset subquery to the query, telling the QueryBuilder to 

1393 # include the rank of the selected collection in the results only if we 

1394 # need to deduplicate. Note that if any of the collections are 

1395 # actually wildcard expressions, and we've asked for deduplication, 

1396 # this will raise TypeError for us. 

1397 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1398 return 

1399 query = builder.finish() 

1400 predicate = query.predicate() 

1401 if not deduplicate: 

1402 # No need to de-duplicate across collections. 

1403 for row in self._db.query(query.sql): 

1404 if predicate(row): 

1405 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1406 if expand: 

1407 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1408 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1409 else: 

1410 # For each data ID, yield only the DatasetRef with the lowest 

1411 # collection rank. 

1412 bestRefs = {} 

1413 bestRanks = {} 

1414 for row in self._db.query(query.sql): 

1415 if predicate(row): 

1416 ref, rank = query.extractDatasetRef(row, datasetType) 

1417 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1418 if rank < bestRank: 

1419 bestRefs[ref.dataId] = ref 

1420 bestRanks[ref.dataId] = rank 

1421 # If caller requested expanded data IDs, we defer that until here 

1422 # so we do as little expansion as possible. 

1423 if expand: 

1424 for ref in bestRefs.values(): 

1425 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1426 yield ref.expanded(dataId) 

1427 else: 

1428 yield from bestRefs.values() 

1429 

1430 dimensions: DimensionUniverse 

1431 """The universe of all dimensions known to the registry 

1432 (`DimensionUniverse`). 

1433 """ 

1434 

1435 storageClasses: StorageClassFactory 

1436 """All storage classes known to the registry (`StorageClassFactory`). 

1437 """