
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "ConsistentDataIds", 

26 "Registry", 

27) 

28 

29from collections import defaultdict 

30import contextlib 

31from dataclasses import dataclass 

32import sys 

33from typing import ( 

34 Any, 

35 Dict, 

36 Iterable, 

37 Iterator, 

38 List, 

39 Mapping, 

40 Optional, 

41 Set, 

42 Type, 

43 TYPE_CHECKING, 

44 Union, 

45) 

46 

47import astropy.time 

48import sqlalchemy 

49 

50import lsst.sphgeom 

51from ..core import ( 

52 Config, 

53 DataCoordinate, 

54 DataId, 

55 DatasetRef, 

56 DatasetType, 

57 ddl, 

58 Dimension, 

59 DimensionElement, 

60 DimensionGraph, 

61 DimensionRecord, 

62 DimensionUniverse, 

63 ExpandedDataCoordinate, 

64 NamedKeyDict, 

65 Timespan, 

66 StorageClassFactory, 

67) 

68from ..core.utils import doImport, iterable, transactional 

69from ._config import RegistryConfig 

70from .queries import ( 

71 QueryBuilder, 

72 QuerySummary, 

73) 

74from ._collectionType import CollectionType 

75from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

76from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

77from .interfaces import ChainedCollectionRecord, RunRecord 

78 

79if TYPE_CHECKING: 

80 from ..butlerConfig import ButlerConfig 

81 from .interfaces import ( 

82 CollectionManager, 

83 Database, 

84 OpaqueTableStorageManager, 

85 DimensionRecordStorageManager, 

86 DatasetRecordStorageManager, 

87 DatastoreRegistryBridgeManager, 

88 ) 

89 

90 

91@dataclass 

92class ConsistentDataIds: 

93 """A struct used to report relationships between data IDs by 

94 `Registry.relateDataIds`. 

95 

96 If an instance of this class is returned (instead of `None`), the data IDs 

97 are "not inconsistent" - any keys they have in common have the same value, 

98 and any spatial or temporal relationships they have are at least consistent 

99 with an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

100 to `True` in boolean contexts. 

101 """ 

102 

103 overlaps: bool 

104 """If `True`, the data IDs have at least one key in common, associated with 

105 the same value. 

106 

107 Note that data IDs are not inconsistent even if overlaps is `False` - they 

108 may simply have no keys in common, which means they cannot have 

109 inconsistent values for any keys. They may even be equal, in the case that 

110 both data IDs are empty. 

111 

112 This field does _not_ indicate whether a spatial or temporal overlap 

113 relationship exists. 

114 """ 

115 

116 contains: bool 

117 """If `True`, all keys in the first data ID are in the second, and are 

118 associated with the same values. 

119 

120 This includes case where the first data ID is empty. 

121 """ 

122 

123 within: bool 

124 """If `True`, all keys in the second data ID are in the first, and are 

125 associated with the same values. 

126 

127 This includes case where the second data ID is empty. 

128 """ 

129 

130 @property 

131 def equal(self) -> bool: 

132 """If `True`, the two data IDs are the same. 

133 

134 Data IDs are equal if they have both a `contains` and a `within` 

135 relationship. 

136 """ 

137 return self.contains and self.within 

138 

139 @property 

140 def disjoint(self) -> bool: 

141 """If `True`, the two data IDs have no keys in common. 

142 

143 This is simply the opposite of `overlaps`. Disjoint data IDs are by 

144 definition not inconsistent. 

145 """ 

146 return not self.overlaps 

147 

148 def __bool__(self) -> bool: 

149 return True 

150 

151 

152class Registry: 

153 """Registry interface. 

154 

155 Parameters 

156 ---------- 

157 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

158 Registry configuration 

159 """ 

160 

161 defaultConfigFile = None 

162 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

163 absolute path. Can be None if no defaults specified. 

164 """ 

165 

166 @classmethod 

167 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

168 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

169 """Create `Registry` subclass instance from `config`. 

170 

171 Uses ``registry.cls`` from `config` to determine which subclass to 

172 instantiate. 

173 

174 Parameters 

175 ---------- 

176 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

177 Registry configuration 

178 create : `bool`, optional 

179 Assume empty Registry and create a new one. 

180 butlerRoot : `str`, optional 

181 Path to the repository root this `Registry` will manage. 

182 writeable : `bool`, optional 

183 If `True` (default) create a read-write connection to the database. 

184 

185 Returns 

186 ------- 

187 registry : `Registry` (subclass) 

188 A new `Registry` subclass instance. 
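
Examples
--------
A minimal sketch, not taken from this package's documentation; it assumes
the usual top-level exports and uses a hypothetical in-memory SQLite
``db`` connection string.

>>> from lsst.daf.butler import Registry
>>> from lsst.daf.butler.registry import RegistryConfig
>>> config = RegistryConfig()
>>> config["db"] = "sqlite://"  # hypothetical in-memory database
>>> registry = Registry.fromConfig(config, create=True)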

189 """ 

190 if not isinstance(config, RegistryConfig): 

191 if isinstance(config, str) or isinstance(config, Config): 

192 config = RegistryConfig(config) 

193 else: 

194 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

195 config.replaceRoot(butlerRoot) 

196 DatabaseClass = config.getDatabaseClass() 

197 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

198 namespace=config.get("namespace"), writeable=writeable) 

199 universe = DimensionUniverse(config) 

200 opaque = doImport(config["managers", "opaque"]) 

201 dimensions = doImport(config["managers", "dimensions"]) 

202 collections = doImport(config["managers", "collections"]) 

203 datasets = doImport(config["managers", "datasets"]) 

204 datastoreBridges = doImport(config["managers", "datastores"]) 

205 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

206 datasets=datasets, datastoreBridges=datastoreBridges, create=create) 

207 

208 def __init__(self, database: Database, universe: DimensionUniverse, *, 

209 opaque: Type[OpaqueTableStorageManager], 

210 dimensions: Type[DimensionRecordStorageManager], 

211 collections: Type[CollectionManager], 

212 datasets: Type[DatasetRecordStorageManager], 

213 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

214 create: bool = False): 

215 self._db = database 

216 self.storageClasses = StorageClassFactory() 

217 with self._db.declareStaticTables(create=create) as context: 

218 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

219 self._collections = collections.initialize(self._db, context) 

220 self._datasets = datasets.initialize(self._db, context, 

221 collections=self._collections, 

222 universe=self.dimensions) 

223 self._opaque = opaque.initialize(self._db, context) 

224 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

225 opaque=self._opaque, 

226 datasets=datasets, 

227 universe=self.dimensions) 

228 self._collections.refresh() 

229 self._datasets.refresh(universe=self._dimensions.universe) 

230 

231 def __str__(self) -> str: 

232 return str(self._db) 

233 

234 def __repr__(self) -> str: 

235 return f"Registry({self._db!r}, {self.dimensions!r})" 

236 

237 def isWriteable(self) -> bool: 

238 """Return `True` if this registry allows write operations, and `False` 

239 otherwise. 

240 """ 

241 return self._db.isWriteable() 

242 

243 @property 

244 def dimensions(self) -> DimensionUniverse: 

245 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

246 """ 

247 return self._dimensions.universe 

248 

249 @contextlib.contextmanager 

250 def transaction(self) -> Iterator[None]: 

251 """Return a context manager that represents a transaction. 

252 """ 

253 # TODO make savepoint=False the default. 

254 try: 

255 with self._db.transaction(): 

256 yield 

257 except BaseException: 

258 # TODO: this clears the caches sometimes when we wouldn't actually 

259 # need to. Can we avoid that? 

260 self._dimensions.clearCaches() 

261 raise 

262 

263 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

264 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

265 other data repository client. 

266 

267 Opaque table records can be added via `insertOpaqueData`, retrieved via 

268 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

269 

270 Parameters 

271 ---------- 

272 tableName : `str` 

273 Logical name of the opaque table. This may differ from the 

274 actual name used in the database by a prefix and/or suffix. 

275 spec : `ddl.TableSpec` 

276 Specification for the table to be added. 
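
Examples
--------
An illustrative sketch only; the table name, column names, and field
specifications below are hypothetical, and ``registry`` is assumed to be
an existing `Registry` instance.

>>> import sqlalchemy
>>> from lsst.daf.butler.core import ddl
>>> spec = ddl.TableSpec(fields=[
...     ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256, primaryKey=True),
...     ddl.FieldSpec("checksum", dtype=sqlalchemy.String, length=64),
... ])
>>> registry.registerOpaqueTable("my_datastore_records", spec)
>>> registry.insertOpaqueData("my_datastore_records",
...                           {"path": "a/b.fits", "checksum": "abc123"})
>>> rows = list(registry.fetchOpaqueData("my_datastore_records", path="a/b.fits"))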

277 """ 

278 self._opaque.register(tableName, spec) 

279 

280 @transactional 

281 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

282 """Insert records into an opaque table. 

283 

284 Parameters 

285 ---------- 

286 tableName : `str` 

287 Logical name of the opaque table. Must match the name used in a 

288 previous call to `registerOpaqueTable`. 

289 data 

290 Each additional positional argument is a dictionary that represents 

291 a single row to be added. 

292 """ 

293 self._opaque[tableName].insert(*data) 

294 

295 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

296 """Retrieve records from an opaque table. 

297 

298 Parameters 

299 ---------- 

300 tableName : `str` 

301 Logical name of the opaque table. Must match the name used in a 

302 previous call to `registerOpaqueTable`. 

303 where 

304 Additional keyword arguments are interpreted as equality 

305 constraints that restrict the returned rows (combined with AND); 

306 keyword arguments are column names and values are the values they 

307 must have. 

308 

309 Yields 

310 ------ 

311 row : `dict` 

312 A dictionary representing a single result row. 

313 """ 

314 yield from self._opaque[tableName].fetch(**where) 

315 

316 @transactional 

317 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

318 """Remove records from an opaque table. 

319 

320 Parameters 

321 ---------- 

322 tableName : `str` 

323 Logical name of the opaque table. Must match the name used in a 

324 previous call to `registerOpaqueTable`. 

325 where 

326 Additional keyword arguments are interpreted as equality 

327 constraints that restrict the deleted rows (combined with AND); 

328 keyword arguments are column names and values are the values they 

329 must have. 

330 """ 

331 self._opaque[tableName].delete(**where) 

332 

333 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None: 

334 """Add a new collection if one with the given name does not exist. 

335 

336 Parameters 

337 ---------- 

338 name : `str` 

339 The name of the collection to create. 

340 type : `CollectionType` 

341 Enum value indicating the type of collection to create. 

342 

343 Notes 

344 ----- 

345 This method cannot be called within transactions, as it needs to be 

346 able to perform its own transaction to be concurrent. 
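
Examples
--------
A short, illustrative sketch; the collection names are hypothetical and
``registry`` is assumed to be an existing `Registry` instance.

>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerCollection("my/tagged", CollectionType.TAGGED)
>>> registry.registerCollection("my/chain", CollectionType.CHAINED)
>>> collectionType = registry.getCollectionType("my/chain")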

347 """ 

348 self._collections.register(name, type) 

349 

350 def getCollectionType(self, name: str) -> CollectionType: 

351 """Return an enumeration value indicating the type of the given 

352 collection. 

353 

354 Parameters 

355 ---------- 

356 name : `str` 

357 The name of the collection. 

358 

359 Returns 

360 ------- 

361 type : `CollectionType` 

362 Enum value indicating the type of this collection. 

363 

364 Raises 

365 ------ 

366 MissingCollectionError 

367 Raised if no collection with the given name exists. 

368 """ 

369 return self._collections.find(name).type 

370 

371 def registerRun(self, name: str) -> None: 

372 """Add a new run if one with the given name does not exist. 

373 

374 Parameters 

375 ---------- 

376 name : `str` 

377 The name of the run to create. 

378 

379 Notes 

380 ----- 

381 This method cannot be called within transactions, as it needs to be 

382 able to perform its own transaction to be concurrent. 

383 """ 

384 self._collections.register(name, CollectionType.RUN) 

385 

386 @transactional 

387 def removeCollection(self, name: str) -> None: 

388 """Completely remove the given collection. 

389 

390 Parameters 

391 ---------- 

392 name : `str` 

393 The name of the collection to remove. 

394 

395 Raises 

396 ------ 

397 MissingCollectionError 

398 Raised if no collection with the given name exists. 

399 

400 Notes 

401 ----- 

402 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

403 in it are also fully removed. This requires that those datasets be 

404 removed (or at least trashed) from any datastores that hold them first. 

405 

406 A collection may not be deleted as long as it is referenced by a 

407 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

408 be deleted or redefined first. 

409 """ 

410 self._collections.remove(name) 

411 

412 def getCollectionChain(self, parent: str) -> CollectionSearch: 

413 """Return the child collections in a `~CollectionType.CHAINED` 

414 collection. 

415 

416 Parameters 

417 ---------- 

418 parent : `str` 

419 Name of the chained collection. Must have already been added via 

420 a call to `Registry.registerCollection`. 

421 

422 Returns 

423 ------- 

424 children : `CollectionSearch` 

425 An object that defines the search path of the collection. 

426 See :ref:`daf_butler_collection_expressions` for more information. 

427 

428 Raises 

429 ------ 

430 MissingCollectionError 

431 Raised if ``parent`` does not exist in the `Registry`. 

432 TypeError 

433 Raised if ``parent`` does not correspond to a 

434 `~CollectionType.CHAINED` collection. 

435 """ 

436 record = self._collections.find(parent) 

437 if record.type is not CollectionType.CHAINED: 

438 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

439 assert isinstance(record, ChainedCollectionRecord) 

440 return record.children 

441 

442 @transactional 

443 def setCollectionChain(self, parent: str, children: Any) -> None: 

444 """Define or redefine a `~CollectionType.CHAINED` collection. 

445 

446 Parameters 

447 ---------- 

448 parent : `str` 

449 Name of the chained collection. Must have already been added via 

450 a call to `Registry.registerCollection`. 

451 children : `Any` 

452 An expression defining an ordered search of child collections, 

453 generally an iterable of `str`. Restrictions on the dataset types 

454 to be searched can also be included, by passing a mapping or an 

455 iterable containing tuples; see 

456 :ref:`daf_butler_collection_expressions` for more information. 

457 

458 Raises 

459 ------ 

460 MissingCollectionError 

461 Raised when any of the given collections do not exist in the 

462 `Registry`. 

463 TypeError 

464 Raised if ``parent`` does not correspond to a 

465 `~CollectionType.CHAINED` collection. 

466 ValueError 

467 Raised if the given collections contain a cycle. 
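
Examples
--------
An illustrative sketch; the collection names are hypothetical, and the
child collections ``HSC/raw`` and ``HSC/calib`` are assumed to exist.

>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerCollection("defaults", CollectionType.CHAINED)
>>> registry.setCollectionChain("defaults", ["HSC/raw", "HSC/calib"])
>>> chain = registry.getCollectionChain("defaults")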

468 """ 

469 record = self._collections.find(parent) 

470 if record.type is not CollectionType.CHAINED: 

471 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

472 assert isinstance(record, ChainedCollectionRecord) 

473 children = CollectionSearch.fromExpression(children) 

474 if children != record.children: 

475 record.update(self._collections, children) 

476 

477 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

478 """ 

479 Add a new `DatasetType` to the Registry. 

480 

481 It is not an error to register the same `DatasetType` twice. 

482 

483 Parameters 

484 ---------- 

485 datasetType : `DatasetType` 

486 The `DatasetType` to be added. 

487 

488 Returns 

489 ------- 

490 inserted : `bool` 

491 `True` if ``datasetType`` was inserted, `False` if an identical 

492 existing `DatasetType` was found. Note that in either case the 

493 DatasetType is guaranteed to be defined in the Registry 

494 consistently with the given definition. 

495 

496 Raises 

497 ------ 

498 ValueError 

499 Raised if the dimensions or storage class are invalid. 

500 ConflictingDefinitionError 

501 Raised if this DatasetType is already registered with a different 

502 definition. 

503 

504 Notes 

505 ----- 

506 This method cannot be called within transactions, as it needs to be 

507 able to perform its own transaction to be concurrent. 
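
Examples
--------
An illustrative sketch; the dataset type name, dimensions, and storage
class below are hypothetical choices, not requirements.

>>> from lsst.daf.butler import DatasetType
>>> datasetType = DatasetType("calexp",
...                           dimensions=["instrument", "visit", "detector"],
...                           storageClass="ExposureF",
...                           universe=registry.dimensions)
>>> inserted = registry.registerDatasetType(datasetType)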

508 """ 

509 _, inserted = self._datasets.register(datasetType) 

510 return inserted 

511 

512 def getDatasetType(self, name: str) -> DatasetType: 

513 """Get the `DatasetType`. 

514 

515 Parameters 

516 ---------- 

517 name : `str` 

518 Name of the type. 

519 

520 Returns 

521 ------- 

522 type : `DatasetType` 

523 The `DatasetType` associated with the given name. 

524 

525 Raises 

526 ------ 

527 KeyError 

528 Raised if the requested dataset type could not be found in the registry. 

529 """ 

530 storage = self._datasets.find(name) 

531 if storage is None: 

532 raise KeyError(f"DatasetType '{name}' could not be found.") 

533 return storage.datasetType 

534 

535 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

536 collections: Any, **kwargs: Any) -> Optional[DatasetRef]: 

537 """Find a dataset given its `DatasetType` and data ID. 

538 

539 This can be used to obtain a `DatasetRef` that permits the dataset to 

540 be read from a `Datastore`. If the dataset is a component and can not 

541 be found using the provided dataset type, a dataset ref for the parent 

542 will be returned instead but with the correct dataset type. 

543 

544 Parameters 

545 ---------- 

546 datasetType : `DatasetType` or `str` 

547 A `DatasetType` or the name of one. 

548 dataId : `dict` or `DataCoordinate`, optional 

549 A `dict`-like object containing the `Dimension` links that identify 

550 the dataset within a collection. 

551 collections 

552 An expression that fully or partially identifies the collections 

553 to search for the dataset, such as a `str`, `re.Pattern`, or 

554 iterable thereof. `...` can be used to return all collections. 

555 See :ref:`daf_butler_collection_expressions` for more information. 

556 **kwargs 

557 Additional keyword arguments passed to 

558 `DataCoordinate.standardize` to convert ``dataId`` to a true 

559 `DataCoordinate` or augment an existing one. 

560 

561 Returns 

562 ------- 

563 ref : `DatasetRef` 

564 A reference to the dataset, or `None` if no matching Dataset 

565 was found. 

566 

567 Raises 

568 ------ 

569 LookupError 

570 Raised if one or more data ID keys are missing or the dataset type 

571 does not exist. 

572 MissingCollectionError 

573 Raised if any of ``collections`` does not exist in the registry. 
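
Examples
--------
An illustrative sketch; the dataset type, collection name, and data ID
values are hypothetical.

>>> ref = registry.findDataset("raw", collections="HSC/raw",
...                            instrument="HSC", exposure=903334,
...                            detector=50)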

574 """ 

575 if isinstance(datasetType, DatasetType): 

576 storage = self._datasets.find(datasetType.name) 

577 if storage is None: 

578 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

579 else: 

580 storage = self._datasets.find(datasetType) 

581 if storage is None: 

582 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

583 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

584 universe=self.dimensions, **kwargs) 

585 collections = CollectionSearch.fromExpression(collections) 

586 for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType): 

587 result = storage.find(collectionRecord, dataId) 

588 if result is not None: 

589 return result 

590 

591 # fallback to the parent if we got nothing and this was a component 

592 if storage.datasetType.isComponent(): 

593 parentType, _ = storage.datasetType.nameAndComponent() 

594 parentRef = self.findDataset(parentType, dataId, collections=collections, **kwargs) 

595 if parentRef is not None: 

596 # Should already conform and we know no components 

597 return DatasetRef(storage.datasetType, parentRef.dataId, id=parentRef.id, 

598 run=parentRef.run, conform=False, hasParentId=True) 

599 

600 return None 

601 

602 @transactional 

603 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

604 run: str) -> List[DatasetRef]: 

605 """Insert one or more datasets into the `Registry` 

606 

607 This always adds new datasets; to associate existing datasets with 

608 a new collection, use ``associate``. 

609 

610 Parameters 

611 ---------- 

612 datasetType : `DatasetType` or `str` 

613 A `DatasetType` or the name of one. 

614 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

615 Dimension-based identifiers for the new datasets. 

616 run : `str` 

617 The name of the run that produced the datasets. 

618 

619 Returns 

620 ------- 

621 refs : `list` of `DatasetRef` 

622 Resolved `DatasetRef` instances for all given data IDs (in the same 

623 order). 

624 

625 Raises 

626 ------ 

627 ConflictingDefinitionError 

628 If a dataset with the same dataset type and data ID as one of those 

629 given already exists in ``run``. 

630 MissingCollectionError 

631 Raised if ``run`` does not exist in the registry. 
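
Examples
--------
An illustrative sketch; it assumes the "raw" dataset type and the
referenced dimension records already exist, and the run name is
hypothetical.

>>> registry.registerRun("HSC/runs/ingest")
>>> (ref,) = registry.insertDatasets("raw",
...                                  dataIds=[{"instrument": "HSC",
...                                            "exposure": 903334,
...                                            "detector": 50}],
...                                  run="HSC/runs/ingest")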

632 """ 

633 if isinstance(datasetType, DatasetType): 

634 storage = self._datasets.find(datasetType.name) 

635 if storage is None: 

636 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

637 else: 

638 storage = self._datasets.find(datasetType) 

639 if storage is None: 

640 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

641 runRecord = self._collections.find(run) 

642 if runRecord.type is not CollectionType.RUN: 

643 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

644 assert isinstance(runRecord, RunRecord) 

645 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

646 for dataId in dataIds] 

647 try: 

648 refs = list(storage.insert(runRecord, expandedDataIds)) 

649 except sqlalchemy.exc.IntegrityError as err: 

650 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

651 f"one or more datasets of type {storage.datasetType} into " 

652 f"collection '{run}'. " 

653 f"This probably means a dataset with the same data ID " 

654 f"and dataset type already exists, but it may also mean a " 

655 f"dimension row is missing.") from err 

656 return refs 

657 

658 def getDataset(self, id: int) -> Optional[DatasetRef]: 

659 """Retrieve a Dataset entry. 

660 

661 Parameters 

662 ---------- 

663 id : `int` 

664 The unique identifier for the dataset. 

665 

666 Returns 

667 ------- 

668 ref : `DatasetRef` or `None` 

669 A ref to the Dataset, or `None` if no matching Dataset 

670 was found. 

671 """ 

672 ref = self._datasets.getDatasetRef(id, universe=self.dimensions) 

673 if ref is None: 

674 return None 

675 return ref 

676 

677 @transactional 

678 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

679 """Remove datasets from the Registry. 

680 

681 The datasets will be removed unconditionally from all collections, and 

682 any `Quantum` that consumed this dataset will instead be marked as 

683 having a NULL input. `Datastore` records will *not* be deleted; the 

684 caller is responsible for ensuring that the dataset has already been 

685 removed from all Datastores. 

686 

687 Parameters 

688 ---------- 

689 refs : `Iterable` of `DatasetRef` 

690 References to the datasets to be removed. Must include a valid 

691 ``id`` attribute, and should be considered invalidated upon return. 

692 

693 Raises 

694 ------ 

695 AmbiguousDatasetError 

696 Raised if any ``ref.id`` is `None`. 

697 OrphanedRecordError 

698 Raised if any dataset is still present in any `Datastore`. 

699 """ 

700 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

701 storage = self._datasets.find(datasetType.name) 

702 assert storage is not None 

703 try: 

704 storage.delete(refsForType) 

705 except sqlalchemy.exc.IntegrityError as err: 

706 raise OrphanedRecordError("One or more datasets is still " 

707 "present in one or more Datastores.") from err 

708 

709 @transactional 

710 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

711 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

712 

713 If a `DatasetRef` with the same integer ID is already in the target 

714 collection, nothing is changed. If a `DatasetRef` with the same 

715 `DatasetType` and data ID but a different integer ID 

716 exists in the collection, `ConflictingDefinitionError` is raised. 

717 

718 Parameters 

719 ---------- 

720 collection : `str` 

721 Indicates the collection the datasets should be associated with. 

722 refs : `Iterable` [ `DatasetRef` ] 

723 An iterable of resolved `DatasetRef` instances that already exist 

724 in this `Registry`. 

725 

726 Raises 

727 ------ 

728 ConflictingDefinitionError 

729 If a Dataset with the given `DatasetRef` already exists in the 

730 given collection. 

731 AmbiguousDatasetError 

732 Raised if ``any(ref.id is None for ref in refs)``. 

733 MissingCollectionError 

734 Raised if ``collection`` does not exist in the registry. 

735 TypeError 

736 Raised if adding new datasets to the given ``collection`` is not 

737 allowed. 
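
Examples
--------
An illustrative sketch; the collection names are hypothetical and the
datasets being tagged are assumed to exist already.

>>> from lsst.daf.butler.registry import CollectionType
>>> registry.registerCollection("best-calibs", CollectionType.TAGGED)
>>> refs = registry.queryDatasets("flat", collections="HSC/calib")
>>> registry.associate("best-calibs", refs)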

738 """ 

739 collectionRecord = self._collections.find(collection) 

740 if collectionRecord.type is not CollectionType.TAGGED: 

741 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

742 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

743 storage = self._datasets.find(datasetType.name) 

744 assert storage is not None 

745 try: 

746 storage.associate(collectionRecord, refsForType) 

747 except sqlalchemy.exc.IntegrityError as err: 

748 raise ConflictingDefinitionError( 

749 f"Constraint violation while associating dataset of type {datasetType.name} with " 

750 f"collection {collection}. This probably means that one or more datasets with the same " 

751 f"dataset type and data ID already exist in the collection, but it may also indicate " 

752 f"that the datasets do not exist." 

753 ) from err 

754 

755 @transactional 

756 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

757 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

758 

759 ``collection`` and ``ref`` combinations that are not currently 

760 associated are silently ignored. 

761 

762 Parameters 

763 ---------- 

764 collection : `str` 

765 The collection the datasets should no longer be associated with. 

766 refs : `Iterable` [ `DatasetRef` ] 

767 An iterable of resolved `DatasetRef` instances that already exist 

768 in this `Registry`. 

769 

770 Raises 

771 ------ 

772 AmbiguousDatasetError 

773 Raised if any of the given dataset references is unresolved. 

774 MissingCollectionError 

775 Raised if ``collection`` does not exist in the registry. 

776 TypeError 

777 Raised if removing datasets from the given ``collection`` is not 

778 allowed. 

779 """ 

780 collectionRecord = self._collections.find(collection) 

781 if collectionRecord.type is not CollectionType.TAGGED: 

782 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

783 "expected TAGGED.") 

784 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

785 storage = self._datasets.find(datasetType.name) 

786 assert storage is not None 

787 storage.disassociate(collectionRecord, refsForType) 

788 

789 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

790 """Return an object that allows a new `Datastore` instance to 

791 communicate with this `Registry`. 

792 

793 Returns 

794 ------- 

795 manager : `DatastoreRegistryBridgeManager` 

796 Object that mediates communication between this `Registry` and its 

797 associated datastores. 

798 """ 

799 return self._datastoreBridges 

800 

801 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

802 """Retrieve datastore locations for a given dataset. 

803 

804 Parameters 

805 ---------- 

806 ref : `DatasetRef` 

807 A reference to the dataset for which to retrieve storage 

808 information. 

809 

810 Returns 

811 ------- 

812 datastores : `Iterable` [ `str` ] 

813 All the matching datastores holding this dataset. 

814 

815 Raises 

816 ------ 

817 AmbiguousDatasetError 

818 Raised if ``ref.id`` is `None`. 

819 """ 

820 return self._datastoreBridges.findDatastores(ref) 

821 

822 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

823 records: Optional[Mapping[DimensionElement, Optional[DimensionRecord]]] = None, 

824 **kwargs: Any) -> ExpandedDataCoordinate: 

825 """Expand a dimension-based data ID to include additional information. 

826 

827 Parameters 

828 ---------- 

829 dataId : `DataCoordinate` or `dict`, optional 

830 Data ID to be expanded; augmented and overridden by ``kwds``. 

831 graph : `DimensionGraph`, optional 

832 Set of dimensions for the expanded ID. If `None`, the dimensions 

833 will be inferred from the keys of ``dataId`` and ``kwds``. 

834 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

835 are silently ignored, providing a way to extract and expand a 

836 subset of a data ID. 

837 records : `Mapping` [`DimensionElement`, `DimensionRecord`], optional 

838 Dimension record data to use before querying the database for that 

839 data. 

840 **kwargs 

841 Additional keywords are treated like additional key-value pairs for 

842 ``dataId``, extending and overriding it. 

843 

844 Returns 

845 ------- 

846 expanded : `ExpandedDataCoordinate` 

847 A data ID that includes full metadata for all of the dimensions it 

848 identifies. 
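
Examples
--------
An illustrative sketch; the data ID values are hypothetical and the
corresponding dimension records are assumed to exist.

>>> full = registry.expandDataId(instrument="HSC", detector=50)
>>> detectorRecord = full.records["detector"]  # a `DimensionRecord`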

849 """ 

850 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

851 if isinstance(standardized, ExpandedDataCoordinate): 

852 return standardized 

853 elif isinstance(dataId, ExpandedDataCoordinate): 

854 records = NamedKeyDict(records) if records is not None else NamedKeyDict() 

855 records.update(dataId.records) 

856 else: 

857 records = NamedKeyDict(records) if records is not None else NamedKeyDict() 

858 keys = dict(standardized.byName()) 

859 regions: List[lsst.sphgeom.ConvexPolygon] = [] 

860 timespans: List[Timespan[astropy.time.Time]] = [] 

861 for element in standardized.graph.primaryKeyTraversalOrder: 

862 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

863 if record is ...: 

864 storage = self._dimensions[element] 

865 record = storage.fetch(keys) 

866 records[element] = record 

867 if record is not None: 

868 for d in element.implied: 

869 value = getattr(record, d.name) 

870 if keys.setdefault(d.name, value) != value: 

871 raise InconsistentDataIdError( 

872 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

873 f"but {element.name} implies {d.name}={value!r}." 

874 ) 

875 if element in standardized.graph.spatial and record.region is not None: 

876 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

877 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

878 f"is disjoint with those for other elements.") 

879 regions.append(record.region) 

880 if element in standardized.graph.temporal: 

881 if any(not record.timespan.overlaps(t) for t in timespans): 

882 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

883 f" is disjoint with those for other elements.") 

884 timespans.append(record.timespan) 

885 else: 

886 if element in standardized.graph.required: 

887 raise LookupError( 

888 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

889 ) 

890 if element.alwaysJoin: 

891 raise InconsistentDataIdError( 

892 f"Could not fetch record for element {element.name} via keys {keys}, ", 

893 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

894 "related." 

895 ) 

896 records.update((d, None) for d in element.implied) 

897 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

898 

899 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

900 """Compare the keys and values of a pair of data IDs for consistency. 

901 

902 See `ConsistentDataIds` for more information. 

903 

904 Parameters 

905 ---------- 

906 a : `dict` or `DataCoordinate` 

907 First data ID to be compared. 

908 b : `dict` or `DataCoordinate` 

909 Second data ID to be compared. 

910 

911 Returns 

912 ------- 

913 relationship : `ConsistentDataIds` or `None` 

914 Relationship information. This is not `None` and coerces to 

915 `True` in boolean contexts if and only if the data IDs are 

916 consistent in terms of all common key-value pairs, all many-to-many 

917 join tables, and all spatial and temporal relationships. 
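
Examples
--------
An illustrative sketch; the data ID values are hypothetical and the
corresponding dimension records are assumed to exist. With the default
dimension configuration the second data ID identifies a superset of the
first's dimensions, so the first is within the second but does not
contain it.

>>> rel = registry.relateDataIds({"instrument": "HSC"},
...                              {"instrument": "HSC", "detector": 50})
>>> bool(rel), rel.contains, rel.within
(True, False, True)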

918 """ 

919 a = DataCoordinate.standardize(a, universe=self.dimensions) 

920 b = DataCoordinate.standardize(b, universe=self.dimensions) 

921 aFull = getattr(a, "full", None) 

922 bFull = getattr(b, "full", None) 

923 aBest = aFull if aFull is not None else a 

924 bBest = bFull if bFull is not None else b 

925 jointKeys = aBest.keys() & bBest.keys() 

926 # If any common values are not equal, we know they are inconsistent. 

927 if any(aBest[k] != bBest[k] for k in jointKeys): 

928 return None 

929 # If the graphs are equal, we know the data IDs are. 

930 if a.graph == b.graph: 

931 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

932 # Result is still inconclusive. Try to expand a data ID containing 

933 # keys from both; that will fail if they are inconsistent. 

934 # First, if either input was already an ExpandedDataCoordinate, extract 

935 # its records so we don't have to query for them. 

936 records: NamedKeyDict[DimensionElement, Optional[DimensionRecord]] = NamedKeyDict() 

937 if isinstance(a, ExpandedDataCoordinate): 

938 records.update(a.records) 

939 if isinstance(b, ExpandedDataCoordinate): 

940 records.update(b.records) 

941 try: 

942 self.expandDataId({**a.byName(), **b.byName()}, graph=(a.graph | b.graph), records=records) 

943 except InconsistentDataIdError: 

944 return None 

945 # We know the answer is not `None`; time to figure out what it is. 

946 return ConsistentDataIds( 

947 contains=(a.graph >= b.graph), 

948 within=(a.graph <= b.graph), 

949 overlaps=bool(a.graph & b.graph), 

950 ) 

951 

952 def insertDimensionData(self, element: Union[DimensionElement, str], 

953 *data: Union[Mapping[str, Any], DimensionRecord], 

954 conform: bool = True) -> None: 

955 """Insert one or more dimension records into the database. 

956 

957 Parameters 

958 ---------- 

959 element : `DimensionElement` or `str` 

960 The `DimensionElement` or name thereof that identifies the table 

961 records will be inserted into. 

962 data : `dict` or `DimensionRecord` (variadic) 

963 One or more records to insert. 

964 conform : `bool`, optional 

965 If `False` (`True` is default) perform no checking or conversions, 

966 and assume that ``element`` is a `DimensionElement` instance and 

967 ``data`` is one or more `DimensionRecord` instances of the 

968 appropriate subclass. 
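
Examples
--------
An illustrative sketch; the element names, field names, and values are
hypothetical, and the full set of required fields depends on the
dimension configuration in use.

>>> registry.insertDimensionData("instrument",
...                              {"name": "HSC", "detector_max": 200})
>>> registry.insertDimensionData("detector",
...                              {"instrument": "HSC", "id": 50,
...                               "full_name": "1_25"})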

969 """ 

970 if conform: 

971 if isinstance(element, str): 

972 element = self.dimensions[element] 

973 records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row) 

974 for row in data] 

975 else: 

976 # Ignore typing since caller said to trust them with conform=False. 

977 records = data # type: ignore 

978 storage = self._dimensions[element] # type: ignore 

979 storage.insert(*records) 

980 

981 def syncDimensionData(self, element: Union[DimensionElement, str], 

982 row: Union[Mapping[str, Any], DimensionRecord], 

983 conform: bool = True) -> bool: 

984 """Synchronize the given dimension record with the database, inserting 

985 if it does not already exist and comparing values if it does. 

986 

987 Parameters 

988 ---------- 

989 element : `DimensionElement` or `str` 

990 The `DimensionElement` or name thereof that identifies the table 

991 records will be inserted into. 

992 row : `dict` or `DimensionRecord` 

993 The record to insert. 

994 conform : `bool`, optional 

995 If `False` (`True` is default) perform no checking or conversions, 

996 and assume that ``element`` is a `DimensionElement` instance and 

997 ``row`` is a `DimensionRecord` instance of the appropriate 

998 subclass. 

999 

1000 Returns 

1001 ------- 

1002 inserted : `bool` 

1003 `True` if a new row was inserted, `False` otherwise. 

1004 

1005 Raises 

1006 ------ 

1007 ConflictingDefinitionError 

1008 Raised if the record exists in the database (according to primary 

1009 key lookup) but is inconsistent with the given one. 

1010 

1011 Notes 

1012 ----- 

1013 This method cannot be called within transactions, as it needs to be 

1014 able to perform its own transaction to be concurrent. 

1015 """ 

1016 if conform: 

1017 if isinstance(element, str): 

1018 element = self.dimensions[element] 

1019 record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row) 

1020 else: 

1021 # Ignore typing since caller said to trust them with conform=False. 

1022 record = row # type: ignore 

1023 storage = self._dimensions[element] # type: ignore 

1024 return storage.sync(record) 

1025 

1026 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1027 ) -> Iterator[DatasetType]: 

1028 """Iterate over the dataset types whose names match an expression. 

1029 

1030 Parameters 

1031 ---------- 

1032 expression : `Any`, optional 

1033 An expression that fully or partially identifies the dataset types 

1034 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1035 `...` can be used to return all dataset types, and is the default. 

1036 See :ref:`daf_butler_dataset_type_expressions` for more 

1037 information. 

1038 components : `bool`, optional 

1039 If `True`, apply all expression patterns to component dataset type 

1040 names as well. If `False`, never apply patterns to components. 

1041 If `None` (default), apply patterns to components only if their 

1042 parent datasets were not matched by the expression. 

1043 Fully-specified component datasets (`str` or `DatasetType` 

1044 instances) are always included. 

1045 

1046 Yields 

1047 ------ 

1048 datasetType : `DatasetType` 

1049 A `DatasetType` instance whose name matches ``expression``. 

1050 """ 

1051 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1052 if wildcard is Ellipsis: 

1053 for datasetType in self._datasets: 

1054 if components or not datasetType.isComponent(): 

1055 yield datasetType 

1056 return 

1057 done: Set[str] = set() 

1058 for name in wildcard.strings: 

1059 storage = self._datasets.find(name) 

1060 if storage is not None: 

1061 done.add(storage.datasetType.name) 

1062 yield storage.datasetType 

1063 if wildcard.patterns: 

1064 # If components (the argument) is None, we'll save component 

1065 # datasets that we might want to match, but only if their parents 

1066 # didn't get included. 

1067 componentsForLater = [] 

1068 for datasetType in self._datasets: 

1069 if datasetType.name in done: 

1070 continue 

1071 parentName, componentName = datasetType.nameAndComponent() 

1072 if componentName is not None and not components: 

1073 if components is None and parentName not in done: 

1074 componentsForLater.append(datasetType) 

1075 continue 

1076 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1077 done.add(datasetType.name) 

1078 yield datasetType 

1079 # Go back and try to match saved components. 

1080 for datasetType in componentsForLater: 

1081 parentName, _ = datasetType.nameAndComponent() 

1082 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1083 yield datasetType 

1084 

1085 def queryCollections(self, expression: Any = ..., 

1086 datasetType: Optional[DatasetType] = None, 

1087 collectionType: Optional[CollectionType] = None, 

1088 flattenChains: bool = False, 

1089 includeChains: Optional[bool] = None) -> Iterator[str]: 

1090 """Iterate over the collections whose names match an expression. 

1091 

1092 Parameters 

1093 ---------- 

1094 expression : `Any`, optional 

1095 An expression that fully or partially identifies the collections 

1096 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1097 `...` can be used to return all collections, and is the default. 

1098 See :ref:`daf_butler_collection_expressions` for more 

1099 information. 

1100 datasetType : `DatasetType`, optional 

1101 If provided, only yield collections that should be searched for 

1102 this dataset type according to ``expression``. If this is 

1103 not provided, any dataset type restrictions in ``expression`` are 

1104 ignored. 

1105 collectionType : `CollectionType`, optional 

1106 If provided, only yield collections of this type. 

1107 flattenChains : `bool`, optional 

1108 If `True` (`False` is default), recursively yield the child 

1109 collections of matching `~CollectionType.CHAINED` collections. 

1110 includeChains : `bool`, optional 

1111 If `True`, yield records for matching `~CollectionType.CHAINED` 

1112 collections. Default is the opposite of ``flattenChains``: include 

1113 either CHAINED collections or their children, but not both. 

1114 

1115 Yields 

1116 ------ 

1117 collection : `str` 

1118 The name of a collection that matches ``expression``. 

1119 """ 

1120 query = CollectionQuery.fromExpression(expression) 

1121 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1122 flattenChains=flattenChains, includeChains=includeChains): 

1123 yield record.name 

1124 

1125 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1126 """Return a `QueryBuilder` instance capable of constructing and 

1127 managing more complex queries than those obtainable via `Registry` 

1128 interfaces. 

1129 

1130 This is an advanced interface; downstream code should prefer 

1131 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1132 are sufficient. 

1133 

1134 Parameters 

1135 ---------- 

1136 summary : `QuerySummary` 

1137 Object describing and categorizing the full set of dimensions that 

1138 will be included in the query. 

1139 

1140 Returns 

1141 ------- 

1142 builder : `QueryBuilder` 

1143 Object that can be used to construct and perform advanced queries. 

1144 """ 

1145 return QueryBuilder(summary=summary, 

1146 collections=self._collections, 

1147 dimensions=self._dimensions, 

1148 datasets=self._datasets) 

1149 

1150 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1151 dataId: Optional[DataId] = None, 

1152 datasets: Any = None, 

1153 collections: Any = None, 

1154 where: Optional[str] = None, 

1155 expand: bool = True, 

1156 components: Optional[bool] = None, 

1157 **kwargs: Any) -> Iterator[DataCoordinate]: 

1158 """Query for and iterate over data IDs matching user-provided criteria. 

1159 

1160 Parameters 

1161 ---------- 

1162 dimensions : `Dimension` or `str`, or iterable thereof 

1163 The dimensions of the data IDs to yield, as either `Dimension` 

1164 instances or `str`. Will be automatically expanded to a complete 

1165 `DimensionGraph`. 

1166 dataId : `dict` or `DataCoordinate`, optional 

1167 A data ID whose key-value pairs are used as equality constraints 

1168 in the query. 

1169 datasets : `Any`, optional 

1170 An expression that fully or partially identifies dataset types 

1171 that should constrain the yielded data IDs. For example, including 

1172 "raw" here would constrain the yielded ``instrument``, 

1173 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1174 those for which at least one "raw" dataset exists in 

1175 ``collections``. Allowed types include `DatasetType`, `str`, 

1176 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1177 expressions, `...` is not permitted - it doesn't make sense to 

1178 constrain data IDs on the existence of *all* datasets. 

1179 See :ref:`daf_butler_dataset_type_expressions` for more 

1180 information. 

1181 collections : `Any`, optional 

1182 An expression that fully or partially identifies the collections 

1183 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1184 thereof. `...` can be used to return all collections. Must be 

1185 provided if ``datasets`` is, and is ignored if it is not. See 

1186 :ref:`daf_butler_collection_expressions` for more information. 

1187 where : `str`, optional 

1188 A string expression similar to a SQL WHERE clause. May involve 

1189 any column of a dimension table or (as a shortcut for the primary 

1190 key column of a dimension table) dimension name. See 

1191 :ref:`daf_butler_dimension_expressions` for more information. 

1192 expand : `bool`, optional 

1193 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1194 minimal `DataCoordinate` base-class instances. 

1195 components : `bool`, optional 

1196 If `True`, apply all dataset expression patterns to component 

1197 dataset type names as well. If `False`, never apply patterns to 

1198 components. If `None` (default), apply patterns to components only 

1199 if their parent datasets were not matched by the expression. 

1200 Fully-specified component datasets (`str` or `DatasetType` 

1201 instances) are always included. 

1202 **kwargs 

1203 Additional keyword arguments are forwarded to 

1204 `DataCoordinate.standardize` when processing the ``dataId`` 

1205 argument (and may be used to provide a constraining data ID even 

1206 when the ``dataId`` argument is `None`). 

1207 

1208 Yields 

1209 ------ 

1210 dataId : `DataCoordinate` 

1211 Data IDs matching the given query parameters. Order is 

1212 unspecified. 
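
Examples
--------
An illustrative sketch; the dataset type, collection, and ``where``
constraint are hypothetical.

>>> dataIds = list(registry.queryDimensions(
...     ["exposure", "detector"],
...     datasets="raw",
...     collections="HSC/raw",
...     where="instrument = 'HSC' AND detector = 50",
... ))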

1213 """ 

1214 dimensions = iterable(dimensions) 

1215 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1216 standardizedDatasetTypes = set() 

1217 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1218 if datasets is not None: 

1219 if collections is None: 

1220 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1221 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1222 requestedDimensionNames.update(datasetType.dimensions.names) 

1223 # If any matched dataset type is a component, just operate on 

1224 # its parent instead, because Registry doesn't know anything 

1225 # about what components exist, and here (unlike queryDatasets) 

1226 # we don't care about returning them. 

1227 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1228 if componentName is not None: 

1229 datasetType = self.getDatasetType(parentDatasetTypeName) 

1230 standardizedDatasetTypes.add(datasetType) 

1231 # Preprocess collections expression in case the original included 

1232 # single-pass iterators (we'll want to use it multiple times 

1233 # below). 

1234 collections = CollectionQuery.fromExpression(collections) 

1235 

1236 summary = QuerySummary( 

1237 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1238 dataId=standardizedDataId, 

1239 expression=where, 

1240 ) 

1241 builder = self.makeQueryBuilder(summary) 

1242 for datasetType in standardizedDatasetTypes: 

1243 builder.joinDataset(datasetType, collections, isResult=False) 

1244 query = builder.finish() 

1245 predicate = query.predicate() 

1246 for row in self._db.query(query.sql): 

1247 if predicate(row): 

1248 result = query.extractDataId(row) 

1249 if expand: 

1250 yield self.expandDataId(result, records=standardizedDataId.records) 

1251 else: 

1252 yield result 

1253 

1254 def queryDatasets(self, datasetType: Any, *, 

1255 collections: Any, 

1256 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1257 dataId: Optional[DataId] = None, 

1258 where: Optional[str] = None, 

1259 deduplicate: bool = False, 

1260 expand: bool = True, 

1261 components: Optional[bool] = None, 

1262 **kwargs: Any) -> Iterator[DatasetRef]: 

1263 """Query for and iterate over dataset references matching user-provided 

1264 criteria. 

1265 

1266 Parameters 

1267 ---------- 

1268 datasetType 

1269 An expression that fully or partially identifies the dataset types 

1270 to be queried. Allowed types include `DatasetType`, `str`, 

1271 `re.Pattern`, and iterables thereof. The special value `...` can 

1272 be used to query all dataset types. See 

1273 :ref:`daf_butler_dataset_type_expressions` for more information. 

1274 collections 

1275 An expression that fully or partially identifies the collections 

1276 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1277 thereof. `...` can be used to return all collections. See 

1278 :ref:`daf_butler_collection_expressions` for more information. 

1279 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1280 Dimensions to include in the query (in addition to those used 

1281 to identify the queried dataset type(s)), either to constrain 

1282 the resulting datasets to those for which a matching dimension 

1283 exists, or to relate the dataset type's dimensions to dimensions 

1284 referenced by the ``dataId`` or ``where`` arguments. 

1285 dataId : `dict` or `DataCoordinate`, optional 

1286 A data ID whose key-value pairs are used as equality constraints 

1287 in the query. 

1288 where : `str`, optional 

1289 A string expression similar to a SQL WHERE clause. May involve 

1290 any column of a dimension table or (as a shortcut for the primary 

1291 key column of a dimension table) dimension name. See 

1292 :ref:`daf_butler_dimension_expressions` for more information. 

1293 deduplicate : `bool`, optional 

1294 If `True` (`False` is default), for each result data ID, only 

1295 yield one `DatasetRef` of each `DatasetType`, from the first 

1296 collection in which a dataset of that dataset type appears 

1297 (according to the order of ``collections`` passed in). If `True`, 

1298 ``collections`` must not contain regular expressions and may not 

1299 be `...`. 

1300 expand : `bool`, optional 

1301 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1302 minimal `DataCoordinate` base-class instances. 

1303 components : `bool`, optional 

1304 If `True`, apply all dataset expression patterns to component 

1305 dataset type names as well. If `False`, never apply patterns to 

1306 components. If `None` (default), apply patterns to components only 

1307 if their parent datasets were not matched by the expression. 

1308 Fully-specified component datasets (`str` or `DatasetType` 

1309 instances) are always included. 

1310 **kwargs 

1311 Additional keyword arguments are forwarded to 

1312 `DataCoordinate.standardize` when processing the ``dataId`` 

1313 argument (and may be used to provide a constraining data ID even 

1314 when the ``dataId`` argument is `None`). 

1315 

1316 Yields 

1317 ------ 

1318 ref : `DatasetRef` 

1319 Dataset references matching the given query criteria. These 

1320 are grouped by `DatasetType` if the query evaluates to multiple 

1321 dataset types, but order is otherwise unspecified. 

1322 

1323 Raises 

1324 ------ 

1325 TypeError 

1326 Raised when the arguments are incompatible, such as when a 

1327 collection wildcard is passed when ``deduplicate`` is `True`. 

1328 

1329 Notes 

1330 ----- 

1331 When multiple dataset types are queried in a single call, the 

1332 results of this operation are equivalent to querying for each dataset 

1333 type separately in turn, and no information about the relationships 

1334 between datasets of different types is included. In contexts where 

1335 that kind of information is important, the recommended pattern is to 

1336 use `queryDimensions` to first obtain data IDs (possibly with the 

1337 desired dataset types and collections passed as constraints to the 

1338 query), and then use multiple (generally much simpler) calls to 

1339 `queryDatasets` with the returned data IDs passed as constraints. 
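
Examples
--------
An illustrative sketch; the dataset type pattern, collection names, and
``where`` constraint are hypothetical.

>>> import re
>>> refs = list(registry.queryDatasets(
...     re.compile(r"calexp.*"),
...     collections=["HSC/runs/a", "HSC/runs/b"],
...     where="instrument = 'HSC' AND visit = 12345",
...     deduplicate=True,
... ))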

1340 """ 

1341 # Standardize the collections expression. 

1342 if deduplicate: 

1343 collections = CollectionSearch.fromExpression(collections) 

1344 else: 

1345 collections = CollectionQuery.fromExpression(collections) 

1346 # Standardize and expand the data ID provided as a constraint. 

1347 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1348 

1349 # We can only query directly if given a non-component DatasetType 

1350 # instance. If we were given an expression or str or a component 

1351 # DatasetType instance, we'll populate this dict, recurse, and return. 

1352 # If we already have a non-component DatasetType, it will remain None 

1353 # and we'll run the query directly. 

1354 composition: Optional[ 

1355 Dict[ 

1356 DatasetType, # parent dataset type 

1357 List[Optional[str]] # component name, or None for parent 

1358 ] 

1359 ] = None 

1360 if not isinstance(datasetType, DatasetType): 

1361 # We were given a dataset type expression (which may be as simple 

1362 # as a str). Loop over all matching datasets, delegating handling 

1363 # of the `components` argument to queryDatasetTypes, as we populate 

1364 # the composition dict. 

1365 composition = defaultdict(list) 

1366 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1367 parentName, componentName = trueDatasetType.nameAndComponent() 

1368 if componentName is not None: 

1369 parentDatasetType = self.getDatasetType(parentName) 

1370 composition.setdefault(parentDatasetType, []).append(componentName) 

1371 else: 

1372 composition.setdefault(trueDatasetType, []).append(None) 

1373 elif datasetType.isComponent(): 

1374 # We were given a true DatasetType instance, but it's a component. 

1375 # the composition dict will have exactly one item. 

1376 parentName, componentName = datasetType.nameAndComponent() 

1377 parentDatasetType = self.getDatasetType(parentName) 

1378 composition = {parentDatasetType: [componentName]} 

1379 if composition is not None: 

1380 # We need to recurse. Do that once for each parent dataset type. 

1381 for parentDatasetType, componentNames in composition.items(): 

1382 for parentRef in self.queryDatasets(parentDatasetType, collections=collections, 

1383 dimensions=dimensions, dataId=standardizedDataId, 

1384 where=where, deduplicate=deduplicate): 

1385 # Loop over the requested components, yielding one ref for 

1386 # each of them. 

1387 for componentName in componentNames: 

1388 if componentName is None: 

1389 yield parentRef 

1390 else: 

1391 yield parentRef.makeComponentRef(componentName) 

1392 return 

1393 # If we get here, there's no need to recurse (or we are already 

1394 # recursing; there can only ever be one level of recursion). 

1395 

1396 # The full set of dimensions in the query is the combination of those 

1397 # needed for the DatasetType and those explicitly requested, if any. 

1398 requestedDimensionNames = set(datasetType.dimensions.names) 

1399 if dimensions is not None: 

1400 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1401 # Construct the summary structure needed to construct a QueryBuilder. 

1402 summary = QuerySummary( 

1403 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1404 dataId=standardizedDataId, 

1405 expression=where, 

1406 ) 

1407 builder = self.makeQueryBuilder(summary) 

1408 # Add the dataset subquery to the query, telling the QueryBuilder to 

1409 # include the rank of the selected collection in the results only if we 

1410 # need to deduplicate. Note that if any of the collections are 

1411 # actually wildcard expressions, and we've asked for deduplication, 

1412 # this will raise TypeError for us. 

1413 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1414 return 

1415 query = builder.finish() 

1416 predicate = query.predicate() 

1417 if not deduplicate: 

1418 # No need to de-duplicate across collections. 

1419 for row in self._db.query(query.sql): 

1420 if predicate(row): 

1421 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1422 if expand: 

1423 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1424 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1425 else: 

1426 # For each data ID, yield only the DatasetRef with the lowest 

1427 # collection rank. 

1428 bestRefs = {} 

1429 bestRanks: Dict[DataCoordinate, int] = {} 

1430 for row in self._db.query(query.sql): 

1431 if predicate(row): 

1432 ref, rank = query.extractDatasetRef(row, datasetType) 

1433 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1434 assert rank is not None 

1435 if rank < bestRank: 

1436 bestRefs[ref.dataId] = ref 

1437 bestRanks[ref.dataId] = rank 

1438 # If caller requested expanded data IDs, we defer that until here 

1439 # so we do as little expansion as possible. 

1440 if expand: 

1441 for ref in bestRefs.values(): 

1442 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1443 yield ref.expanded(dataId) 

1444 else: 

1445 yield from bestRefs.values() 

1446 

1447 storageClasses: StorageClassFactory 

1448 """All storage classes known to the registry (`StorageClassFactory`). 

1449 """