# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "Registry",
)

from collections import defaultdict
import contextlib
import logging
import sys
from typing import (
    Any,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Type,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy

from ..core import (
    Config,
    DataCoordinate,
    DataCoordinateIterable,
    DataId,
    DatasetRef,
    DatasetType,
    ddl,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    NamedKeyMapping,
    NameLookupMapping,
    StorageClassFactory,
)
from ..core.utils import doImport, iterable, transactional
from ._config import RegistryConfig
from .queries import (
    QueryBuilder,
    QuerySummary,
)
from ._collectionType import CollectionType
from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError
from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis
from .interfaces import ChainedCollectionRecord, RunRecord
from .versions import ButlerVersionsManager, DigestMismatchError

if TYPE_CHECKING:
    from ..butlerConfig import ButlerConfig
    from .interfaces import (
        ButlerAttributeManager,
        CollectionManager,
        Database,
        OpaqueTableStorageManager,
        DimensionRecordStorageManager,
        DatasetRecordStorageManager,
        DatastoreRegistryBridgeManager,
    )


_LOG = logging.getLogger(__name__)


class Registry:
    """Registry interface.

    Parameters
    ----------
    database : `Database`
        Database instance to store the Registry.
    universe : `DimensionUniverse`
        Full set of dimensions for the Registry.
    attributes : `type`
        Manager class implementing `ButlerAttributeManager`.
    opaque : `type`
        Manager class implementing `OpaqueTableStorageManager`.
    dimensions : `type`
        Manager class implementing `DimensionRecordStorageManager`.
    collections : `type`
        Manager class implementing `CollectionManager`.
    datasets : `type`
        Manager class implementing `DatasetRecordStorageManager`.
    datastoreBridges : `type`
        Manager class implementing `DatastoreRegistryBridgeManager`.
    writeable : `bool`, optional
        If `True`, the Registry will support write operations.
    create : `bool`, optional
        If `True`, the database schema will be initialized; it must be empty
        before instantiating the Registry.
    """

    defaultConfigFile: Optional[str] = None
    """Path to configuration defaults. Accessed within the ``config`` resource
    or relative to a search path. Can be `None` if no defaults are specified.
    """

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
                   butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
        """Create a `Registry` subclass instance from ``config``.

        Uses ``registry.cls`` from ``config`` to determine which subclass to
        instantiate.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        create : `bool`, optional
            Assume an empty Registry and create a new one.
        butlerRoot : `str`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the
            database.

        Returns
        -------
        registry : `Registry` (subclass)
            A new `Registry` subclass instance.

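        Examples
        --------
        A minimal usage sketch; the configuration path shown here is
        hypothetical and must point at an existing repository
        configuration::

            registry = Registry.fromConfig("repo/butler.yaml", writeable=False)
            print(registry.dimensions)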
        """
        if not isinstance(config, RegistryConfig):
            if isinstance(config, (str, Config)):
                config = RegistryConfig(config)
            else:
                raise ValueError("Incompatible Registry configuration: {}".format(config))
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        universe = DimensionUniverse(config)
        attributes = doImport(config["managers", "attributes"])
        opaque = doImport(config["managers", "opaque"])
        dimensions = doImport(config["managers", "dimensions"])
        collections = doImport(config["managers", "collections"])
        datasets = doImport(config["managers", "datasets"])
        datastoreBridges = doImport(config["managers", "datastores"])

        return cls(database, universe, dimensions=dimensions, attributes=attributes, opaque=opaque,
                   collections=collections, datasets=datasets, datastoreBridges=datastoreBridges,
                   writeable=writeable, create=create)

    def __init__(self, database: Database, universe: DimensionUniverse, *,
                 attributes: Type[ButlerAttributeManager],
                 opaque: Type[OpaqueTableStorageManager],
                 dimensions: Type[DimensionRecordStorageManager],
                 collections: Type[CollectionManager],
                 datasets: Type[DatasetRecordStorageManager],
                 datastoreBridges: Type[DatastoreRegistryBridgeManager],
                 writeable: bool = True,
                 create: bool = False):
        self._db = database
        self.storageClasses = StorageClassFactory()
        with self._db.declareStaticTables(create=create) as context:
            self._attributes = attributes.initialize(self._db, context)
            self._dimensions = dimensions.initialize(self._db, context, universe=universe)
            self._collections = collections.initialize(self._db, context)
            self._datasets = datasets.initialize(self._db, context,
                                                 collections=self._collections,
                                                 universe=self.dimensions)
            self._opaque = opaque.initialize(self._db, context)
            self._datastoreBridges = datastoreBridges.initialize(self._db, context,
                                                                 opaque=self._opaque,
                                                                 datasets=datasets,
                                                                 universe=self.dimensions)
            versions = ButlerVersionsManager(
                self._attributes,
                dict(
                    attributes=self._attributes,
                    opaque=self._opaque,
                    dimensions=self._dimensions,
                    collections=self._collections,
                    datasets=self._datasets,
                    datastores=self._datastoreBridges,
                )
            )
            # Store managers and their versions in the attributes table.
            context.addInitializer(lambda db: versions.storeManagersConfig())
            context.addInitializer(lambda db: versions.storeManagersVersions())

        if not create:
            # Verify that the configured versions are compatible with the
            # schema.
            versions.checkManagersConfig()
            versions.checkManagersVersions(writeable)
            try:
                versions.checkManagersDigests()
            except DigestMismatchError as exc:
                # A digest mismatch is potentially a serious error, but during
                # development it can be benign; treat it as a warning for now.
                _LOG.warning(f"Registry schema digest mismatch: {exc}")

        self._collections.refresh()
        self._datasets.refresh(universe=self._dimensions.universe)

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"Registry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        """Return `True` if this registry allows write operations, and `False`
        otherwise.
        """
        return self._db.isWriteable()

    @property
    def dimensions(self) -> DimensionUniverse:
        """All dimensions recognized by this `Registry` (`DimensionUniverse`).
        """
        return self._dimensions.universe

    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        """Return a context manager that represents a transaction.

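        Examples
        --------
        A sketch grouping two writes so that they succeed or fail together;
        ``refs`` and the collection names are hypothetical::

            with registry.transaction():
                registry.associate("my/tagged", refs)
                registry.disassociate("old/tagged", refs)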
        """
        # TODO: make savepoint=False the default.
        try:
            with self._db.transaction():
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._dimensions.clearCaches()
            raise

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None:
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved
        via `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict) -> None:
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that
            represents a single row to be added.
        """
        self._opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.

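        Examples
        --------
        A sketch of the opaque-table round trip; the table name and columns
        are hypothetical, and the table is assumed to have been registered
        already via `registerOpaqueTable`::

            registry.insertOpaqueData("datastore_records",
                                      {"dataset_id": 42, "path": "a/b.fits"})
            rows = list(registry.fetchOpaqueData("datastore_records",
                                                 dataset_id=42))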
        """
        yield from self._opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._opaque[tableName].delete(**where)

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
        """Add a new collection if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the collection to create.
        type : `CollectionType`
            Enum value indicating the type of collection to create.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.

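        Examples
        --------
        A sketch using hypothetical collection names::

            registry.registerCollection("my/tagged", CollectionType.TAGGED)
            registry.registerRun("my/run")
            registry.registerCollection("my/chain", CollectionType.CHAINED)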
        """
        self._collections.register(name, type)

    def getCollectionType(self, name: str) -> CollectionType:
        """Return an enumeration value indicating the type of the given
        collection.

        Parameters
        ----------
        name : `str`
            The name of the collection.

        Returns
        -------
        type : `CollectionType`
            Enum value indicating the type of this collection.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with the given name exists.
        """
        return self._collections.find(name).type

    def registerRun(self, name: str) -> None:
        """Add a new run if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the run to create.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        self._collections.register(name, CollectionType.RUN)

    @transactional
    def removeCollection(self, name: str) -> None:
        """Completely remove the given collection.

        Parameters
        ----------
        name : `str`
            The name of the collection to remove.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with the given name exists.

        Notes
        -----
        If this is a `~CollectionType.RUN` collection, all datasets and quanta
        in it are also fully removed. This requires that those datasets be
        removed (or at least trashed) from any datastores that hold them first.

        A collection may not be deleted as long as it is referenced by a
        `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must
        be deleted or redefined first.
        """
        self._collections.remove(name)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        """Return the child collections in a `~CollectionType.CHAINED`
        collection.

        Parameters
        ----------
        parent : `str`
            Name of the chained collection. Must have already been added via
            a call to `Registry.registerCollection`.

        Returns
        -------
        children : `CollectionSearch`
            An object that defines the search path of the collection.
            See :ref:`daf_butler_collection_expressions` for more information.

        Raises
        ------
        MissingCollectionError
            Raised if ``parent`` does not exist in the `Registry`.
        TypeError
            Raised if ``parent`` does not correspond to a
            `~CollectionType.CHAINED` collection.
        """
        record = self._collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        return record.children

    @transactional
    def setCollectionChain(self, parent: str, children: Any) -> None:
        """Define or redefine a `~CollectionType.CHAINED` collection.

        Parameters
        ----------
        parent : `str`
            Name of the chained collection. Must have already been added via
            a call to `Registry.registerCollection`.
        children : `Any`
            An expression defining an ordered search of child collections,
            generally an iterable of `str`. Restrictions on the dataset types
            to be searched can also be included, by passing a mapping or an
            iterable containing tuples; see
            :ref:`daf_butler_collection_expressions` for more information.

        Raises
        ------
        MissingCollectionError
            Raised when any of the given collections do not exist in the
            `Registry`.
        TypeError
            Raised if ``parent`` does not correspond to a
            `~CollectionType.CHAINED` collection.
        ValueError
            Raised if the given collections contain a cycle.

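        Examples
        --------
        A sketch that points a chained collection at a run and a tagged
        collection; the names are hypothetical and must already be
        registered::

            registry.registerCollection("my/chain", CollectionType.CHAINED)
            registry.setCollectionChain("my/chain", ["my/run", "my/tagged"])
            print(registry.getCollectionChain("my/chain"))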
        """
        record = self._collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        assert isinstance(record, ChainedCollectionRecord)
        children = CollectionSearch.fromExpression(children)
        if children != record.children:
            record.update(self._collections, children)

    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        """Add a new `DatasetType` to the Registry.

        It is not an error to register the same `DatasetType` twice.

        Parameters
        ----------
        datasetType : `DatasetType`
            The `DatasetType` to be added.

        Returns
        -------
        inserted : `bool`
            `True` if ``datasetType`` was inserted, `False` if an identical
            existing `DatasetType` was found. Note that in either case the
            DatasetType is guaranteed to be defined in the Registry
            consistently with the given definition.

        Raises
        ------
        ValueError
            Raised if the dimensions or storage class are invalid.
        ConflictingDefinitionError
            Raised if this DatasetType is already registered with a different
            definition.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.

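        Examples
        --------
        A sketch of defining and registering a dataset type; the name,
        dimensions, and storage class shown are illustrative and assume they
        exist in this repository's configuration::

            datasetType = DatasetType(
                "flat",
                dimensions=registry.dimensions.extract(
                    ["instrument", "detector", "physical_filter"]),
                storageClass="ExposureF",
            )
            registry.registerDatasetType(datasetType)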
        """
        _, inserted = self._datasets.register(datasetType)
        return inserted

    def getDatasetType(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        KeyError
            Raised if the requested dataset type could not be found in the
            registry.
        """
        storage = self._datasets.find(name)
        if storage is None:
            raise KeyError(f"DatasetType '{name}' could not be found.")
        return storage.datasetType

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any, **kwargs: Any) -> Optional[DatasetRef]:
        """Find a dataset given its `DatasetType` and data ID.

        This can be used to obtain a `DatasetRef` that permits the dataset to
        be read from a `Datastore`. If the dataset is a component and cannot
        be found using the provided dataset type, a dataset ref for the parent
        will be returned instead but with the correct dataset type.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict`-like object containing the `Dimension` links that identify
            the dataset within a collection.
        collections
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. `...` can be used to search all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        **kwargs
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``dataId`` to a true
            `DataCoordinate` or augment an existing one.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        LookupError
            Raised if one or more data ID keys are missing or the dataset type
            does not exist.
        MissingCollectionError
            Raised if any of ``collections`` does not exist in the registry.

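        Examples
        --------
        A sketch using hypothetical dataset type, data ID, and collection
        names::

            ref = registry.findDataset("flat",
                                       instrument="HyperCam", detector=10,
                                       physical_filter="HyperCam-R",
                                       collections=["my/run"])
            if ref is not None:
                print(ref.dataId)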
        """
        if isinstance(datasetType, DatasetType):
            storage = self._datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions,
                                            universe=self.dimensions, **kwargs)
        collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._collections, datasetType=storage.datasetType):
            result = storage.find(collectionRecord, dataId)
            if result is not None:
                return result

        return None

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: str) -> List[DatasetRef]:
        """Insert one or more datasets into the `Registry`.

        This always adds new datasets; to associate existing datasets with
        a new collection, use ``associate``.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
            Dimension-based identifiers for the new datasets.
        run : `str`
            The name of the run that produced the datasets.

        Returns
        -------
        refs : `list` of `DatasetRef`
            Resolved `DatasetRef` instances for all given data IDs (in the same
            order).

        Raises
        ------
        ConflictingDefinitionError
            Raised if a dataset with the same dataset type and data ID as one
            of those given already exists in ``run``.
        MissingCollectionError
            Raised if ``run`` does not exist in the registry.

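        Examples
        --------
        A sketch using a hypothetical dataset type, data ID, and run name::

            (ref,) = registry.insertDatasets(
                "flat",
                dataIds=[{"instrument": "HyperCam", "detector": 10,
                          "physical_filter": "HyperCam-R"}],
                run="my/run",
            )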
        """
        if isinstance(datasetType, DatasetType):
            storage = self._datasets.find(datasetType.name)
            if storage is None:
                raise LookupError(f"DatasetType '{datasetType}' has not been registered.")
        else:
            storage = self._datasets.find(datasetType)
            if storage is None:
                raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.")
        runRecord = self._collections.find(run)
        if runRecord.type is not CollectionType.RUN:
            raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.")
        assert isinstance(runRecord, RunRecord)
        expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions)
                           for dataId in dataIds]
        try:
            refs = list(storage.insert(runRecord, expandedDataIds))
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting "
                                             f"one or more datasets of type {storage.datasetType} into "
                                             f"collection '{run}'. "
                                             f"This probably means a dataset with the same data ID "
                                             f"and dataset type already exists, but it may also mean a "
                                             f"dimension row is missing.") from err
        return refs

    def getDataset(self, id: int) -> Optional[DatasetRef]:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `int`
            The unique identifier for the dataset.

        Returns
        -------
        ref : `DatasetRef` or `None`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
        """
        ref = self._datasets.getDatasetRef(id, universe=self.dimensions)
        if ref is None:
            return None
        return ref

    @transactional
    def removeDatasets(self, refs: Iterable[DatasetRef]) -> None:
        """Remove datasets from the Registry.

        The datasets will be removed unconditionally from all collections, and
        any `Quantum` that consumed this dataset will instead be marked with
        having a NULL input. `Datastore` records will *not* be deleted; the
        caller is responsible for ensuring that the dataset has already been
        removed from all Datastores.

        Parameters
        ----------
        refs : `Iterable` of `DatasetRef`
            References to the datasets to be removed. Must include a valid
            ``id`` attribute, and should be considered invalidated upon return.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any ``ref.id`` is `None`.
        OrphanedRecordError
            Raised if any dataset is still present in any `Datastore`.
        """
        for datasetType, refsForType in DatasetRef.groupByType(refs).items():
            storage = self._datasets.find(datasetType.name)
            assert storage is not None
            try:
                storage.delete(refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise OrphanedRecordError("One or more datasets is still "
                                          "present in one or more Datastores.") from err

    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        """Add existing datasets to a `~CollectionType.TAGGED` collection.

        If a `DatasetRef` with the same exact integer ID is already in a
        collection, nothing is changed. If a `DatasetRef` with the same
        `DatasetType` and data ID but with a different integer ID
        exists in the collection, `ConflictingDefinitionError` is raised.

        Parameters
        ----------
        collection : `str`
            Indicates the collection the datasets should be associated with.
        refs : `Iterable` [ `DatasetRef` ]
            An iterable of resolved `DatasetRef` instances that already exist
            in this `Registry`.

        Raises
        ------
        ConflictingDefinitionError
            Raised if a Dataset with the given `DatasetRef` already exists in
            the given collection.
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        MissingCollectionError
            Raised if ``collection`` does not exist in the registry.
        TypeError
            Raised if adding new datasets to the given ``collection`` is not
            allowed.

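        Examples
        --------
        A sketch tagging previously-inserted datasets; ``refs`` and the
        collection name are hypothetical::

            registry.registerCollection("my/tagged", CollectionType.TAGGED)
            registry.associate("my/tagged", refs)
            # ...and later, to undo the association:
            registry.disassociate("my/tagged", refs)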
        """
        collectionRecord = self._collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        for datasetType, refsForType in DatasetRef.groupByType(refs).items():
            storage = self._datasets.find(datasetType.name)
            assert storage is not None
            try:
                storage.associate(collectionRecord, refsForType)
            except sqlalchemy.exc.IntegrityError as err:
                raise ConflictingDefinitionError(
                    f"Constraint violation while associating dataset of type {datasetType.name} with "
                    f"collection {collection}. This probably means that one or more datasets with the same "
                    f"dataset type and data ID already exist in the collection, but it may also indicate "
                    f"that the datasets do not exist."
                ) from err

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None:
        """Remove existing datasets from a `~CollectionType.TAGGED` collection.

        ``collection`` and ``ref`` combinations that are not currently
        associated are silently ignored.

        Parameters
        ----------
        collection : `str`
            The collection the datasets should no longer be associated with.
        refs : `Iterable` [ `DatasetRef` ]
            An iterable of resolved `DatasetRef` instances that already exist
            in this `Registry`.

        Raises
        ------
        AmbiguousDatasetError
            Raised if any of the given dataset references is unresolved.
        MissingCollectionError
            Raised if ``collection`` does not exist in the registry.
        TypeError
            Raised if removing datasets from the given ``collection`` is not
            allowed.
        """
        collectionRecord = self._collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        for datasetType, refsForType in DatasetRef.groupByType(refs).items():
            storage = self._datasets.find(datasetType.name)
            assert storage is not None
            storage.disassociate(collectionRecord, refsForType)

    def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager:
        """Return an object that allows a new `Datastore` instance to
        communicate with this `Registry`.

        Returns
        -------
        manager : `DatastoreRegistryBridgeManager`
            Object that mediates communication between this `Registry` and its
            associated datastores.
        """
        return self._datastoreBridges

    def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
        """Retrieve datastore locations for a given dataset.

        Parameters
        ----------
        ref : `DatasetRef`
            A reference to the dataset for which to retrieve storage
            information.

        Returns
        -------
        datastores : `Iterable` [ `str` ]
            All the matching datastores holding this dataset.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        return self._datastoreBridges.findDatastores(ref)

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                     **kwargs: Any) -> DataCoordinate:
        """Expand a dimension-based data ID to include additional information.

        Parameters
        ----------
        dataId : `DataCoordinate` or `dict`, optional
            Data ID to be expanded; augmented and overridden by ``kwargs``.
        graph : `DimensionGraph`, optional
            Set of dimensions for the expanded ID. If `None`, the dimensions
            will be inferred from the keys of ``dataId`` and ``kwargs``.
            Dimensions that are in ``dataId`` or ``kwargs`` but not in
            ``graph`` are silently ignored, providing a way to extract and
            expand a subset of a data ID.
        records : `Mapping` [`str`, `DimensionRecord`], optional
            Dimension record data to use before querying the database for that
            data, keyed by element name.
        **kwargs
            Additional keywords are treated like additional key-value pairs
            for ``dataId``, extending and overriding it.

        Returns
        -------
        expanded : `DataCoordinate`
            A data ID that includes full metadata for all of the dimensions it
            identifies, i.e. guarantees that ``expanded.hasRecords()`` and
            ``expanded.hasFull()`` both return `True`.

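        Examples
        --------
        A sketch using hypothetical dimension values; the expanded data ID
        carries the implied dimensions and their records as well::

            dataId = registry.expandDataId(instrument="HyperCam",
                                           exposure=12345)
            assert dataId.hasRecords() and dataId.hasFull()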
        """
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs)
        if standardized.hasRecords():
            return standardized
        if records is None:
            records = {}
        elif isinstance(records, NamedKeyMapping):
            records = records.byName()
        else:
            records = dict(records)
        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
            records.update(dataId.records.byName())
        keys = standardized.byName()
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                if isinstance(element, Dimension) and keys.get(element.name) is None:
                    if element in standardized.graph.required:
                        raise LookupError(
                            f"No value or null value for required dimension {element.name}."
                        )
                    keys[element.name] = None
                    record = None
                else:
                    storage = self._dimensions[element]
                    dataIdSet = DataCoordinateIterable.fromScalar(
                        DataCoordinate.standardize(keys, graph=element.graph)
                    )
                    fetched = tuple(storage.fetch(dataIdSet))
                    try:
                        (record,) = fetched
                    except ValueError:
                        record = None
                records[element.name] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d.name, value) != value:
                        raise InconsistentDataIdError(
                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
                            f"but {element.name} implies {d.name}={value!r}."
                        )
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        "related."
                    )
                for d in element.implied:
                    keys.setdefault(d.name, None)
                    records.setdefault(d.name, None)
        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[Mapping[str, Any], DimensionRecord],
                            conform: bool = True) -> None:
        """Insert one or more dimension records into the database.

        Parameters
        ----------
        element : `DimensionElement` or `str`
            The `DimensionElement` or name thereof that identifies the table
            records will be inserted into.
        data : `dict` or `DimensionRecord` (variadic)
            One or more records to insert.
        conform : `bool`, optional
            If `False` (`True` is default) perform no checking or conversions,
            and assume that ``element`` is a `DimensionElement` instance and
            ``data`` consists of one or more `DimensionRecord` instances of
            the appropriate subclass.

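        Examples
        --------
        A sketch inserting a single record; the element name and the required
        fields depend on the configured dimension universe, so the values
        shown here are illustrative only::

            registry.insertDimensionData(
                "instrument",
                {"name": "HyperCam", "class_name": "my.pkg.HyperCam"},
            )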
        """
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            records = [row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
                       for row in data]
        else:
            # Ignore typing since caller said to trust them with conform=False.
            records = data  # type: ignore
        storage = self._dimensions[element]  # type: ignore
        storage.insert(*records)

    def syncDimensionData(self, element: Union[DimensionElement, str],
                          row: Union[Mapping[str, Any], DimensionRecord],
                          conform: bool = True) -> bool:
        """Synchronize the given dimension record with the database, inserting
        it if it does not already exist and comparing values if it does.

        Parameters
        ----------
        element : `DimensionElement` or `str`
            The `DimensionElement` or name thereof that identifies the table
            the record will be inserted into.
        row : `dict` or `DimensionRecord`
            The record to insert.
        conform : `bool`, optional
            If `False` (`True` is default) perform no checking or conversions,
            and assume that ``element`` is a `DimensionElement` instance and
            ``row`` is a `DimensionRecord` instance of the appropriate
            subclass.

        Returns
        -------
        inserted : `bool`
            `True` if a new row was inserted, `False` otherwise.

        Raises
        ------
        ConflictingDefinitionError
            Raised if the record exists in the database (according to primary
            key lookup) but is inconsistent with the given one.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        if conform:
            if isinstance(element, str):
                element = self.dimensions[element]
            record = row if isinstance(row, DimensionRecord) else element.RecordClass.fromDict(row)
        else:
            # Ignore typing since caller said to trust them with conform=False.
            record = row  # type: ignore
        storage = self._dimensions[element]  # type: ignore
        return storage.sync(record)

    def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None
                          ) -> Iterator[DatasetType]:
        """Iterate over the dataset types whose names match an expression.

        Parameters
        ----------
        expression : `Any`, optional
            An expression that fully or partially identifies the dataset types
            to return, such as a `str`, `re.Pattern`, or iterable thereof.
            `...` can be used to return all dataset types, and is the default.
            See :ref:`daf_butler_dataset_type_expressions` for more
            information.
        components : `bool`, optional
            If `True`, apply all expression patterns to component dataset type
            names as well. If `False`, never apply patterns to components.
            If `None` (default), apply patterns to components only if their
            parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.

        Yields
        ------
        datasetType : `DatasetType`
            A `DatasetType` instance whose name matches ``expression``.

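        Examples
        --------
        A sketch matching dataset type names by pattern; the pattern is
        hypothetical::

            import re

            for datasetType in registry.queryDatasetTypes(re.compile("flat.*")):
                print(datasetType.name)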
        """
        wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name)
        if wildcard is Ellipsis:
            for datasetType in self._datasets:
                # The dataset type can no longer be a component.
                yield datasetType
                if components and datasetType.isComposite():
                    # Automatically create the component dataset types.
                    for component in datasetType.makeAllComponentDatasetTypes():
                        yield component
            return
        done: Set[str] = set()
        for name in wildcard.strings:
            storage = self._datasets.find(name)
            if storage is not None:
                done.add(storage.datasetType.name)
                yield storage.datasetType
        if wildcard.patterns:
            # If components (the argument) is None, we'll save component
            # dataset types that we might want to match, but only if their
            # parents didn't get included.
            componentsForLater = []
            for registeredDatasetType in self._datasets:
                # Components are not stored in the registry, so expand them
                # here.
                allDatasetTypes = [registeredDatasetType] \
                    + registeredDatasetType.makeAllComponentDatasetTypes()
                for datasetType in allDatasetTypes:
                    if datasetType.name in done:
                        continue
                    parentName, componentName = datasetType.nameAndComponent()
                    if componentName is not None and not components:
                        if components is None and parentName not in done:
                            componentsForLater.append(datasetType)
                        continue
                    if any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                        done.add(datasetType.name)
                        yield datasetType
            # Go back and try to match saved components.
            for datasetType in componentsForLater:
                parentName, _ = datasetType.nameAndComponent()
                if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns):
                    yield datasetType

    def queryCollections(self, expression: Any = ...,
                         datasetType: Optional[DatasetType] = None,
                         collectionType: Optional[CollectionType] = None,
                         flattenChains: bool = False,
                         includeChains: Optional[bool] = None) -> Iterator[str]:
        """Iterate over the collections whose names match an expression.

        Parameters
        ----------
        expression : `Any`, optional
            An expression that fully or partially identifies the collections
            to return, such as a `str`, `re.Pattern`, or iterable thereof.
            `...` can be used to return all collections, and is the default.
            See :ref:`daf_butler_collection_expressions` for more
            information.
        datasetType : `DatasetType`, optional
            If provided, only yield collections that should be searched for
            this dataset type according to ``expression``. If this is
            not provided, any dataset type restrictions in ``expression`` are
            ignored.
        collectionType : `CollectionType`, optional
            If provided, only yield collections of this type.
        flattenChains : `bool`, optional
            If `True` (`False` is default), recursively yield the child
            collections of matching `~CollectionType.CHAINED` collections.
        includeChains : `bool`, optional
            If `True`, yield records for matching `~CollectionType.CHAINED`
            collections. Default is the opposite of ``flattenChains``: include
            either CHAINED collections or their children, but not both.

        Yields
        ------
        collection : `str`
            The name of a collection that matches ``expression``.

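        Examples
        --------
        A sketch listing all RUN-type collections whose names match a
        hypothetical pattern::

            import re

            for name in registry.queryCollections(re.compile("HyperCam/.*"),
                                                  collectionType=CollectionType.RUN):
                print(name)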
        """
        query = CollectionQuery.fromExpression(expression)
        for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType,
                                 flattenChains=flattenChains, includeChains=includeChains):
            yield record.name

    def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced interface; downstream code should prefer
        `Registry.queryDimensions` and `Registry.queryDatasets` whenever those
        are sufficient.

        Parameters
        ----------
        summary : `QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.

        Returns
        -------
        builder : `QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return QueryBuilder(summary=summary,
                            collections=self._collections,
                            dimensions=self._dimensions,
                            datasets=self._datasets)

    def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
                        dataId: Optional[DataId] = None,
                        datasets: Any = None,
                        collections: Any = None,
                        where: Optional[str] = None,
                        expand: bool = True,
                        components: Optional[bool] = None,
                        **kwargs: Any) -> Iterator[DataCoordinate]:
        """Query for and iterate over data IDs matching user-provided criteria.

        Parameters
        ----------
        dimensions : `Dimension` or `str`, or iterable thereof
            The dimensions of the data IDs to yield, as either `Dimension`
            instances or `str`. Will be automatically expanded to a complete
            `DimensionGraph`.
        dataId : `dict` or `DataCoordinate`, optional
            A data ID whose key-value pairs are used as equality constraints
            in the query.
        datasets : `Any`, optional
            An expression that fully or partially identifies dataset types
            that should constrain the yielded data IDs. For example, including
            "raw" here would constrain the yielded ``instrument``,
            ``exposure``, ``detector``, and ``physical_filter`` values to only
            those for which at least one "raw" dataset exists in
            ``collections``. Allowed types include `DatasetType`, `str`,
            `re.Pattern`, and iterables thereof. Unlike other dataset type
            expressions, `...` is not permitted; it doesn't make sense to
            constrain data IDs on the existence of *all* datasets.
            See :ref:`daf_butler_dataset_type_expressions` for more
            information.
        collections : `Any`, optional
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to return all collections. Must be
            provided if ``datasets`` is, and is ignored if it is not. See
            :ref:`daf_butler_collection_expressions` for more information.
        where : `str`, optional
            A string expression similar to a SQL WHERE clause. May involve
            any column of a dimension table or (as a shortcut for the primary
            key column of a dimension table) dimension name. See
            :ref:`daf_butler_dimension_expressions` for more information.
        expand : `bool`, optional
            If `True` (default) yield `DataCoordinate` instances for which
            `~DataCoordinate.hasRecords` is guaranteed to return `True`,
            performing extra database fetches as necessary.
        components : `bool`, optional
            If `True`, apply all dataset expression patterns to component
            dataset type names as well. If `False`, never apply patterns to
            components. If `None` (default), apply patterns to components only
            if their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        **kwargs
            Additional keyword arguments are forwarded to
            `DataCoordinate.standardize` when processing the ``dataId``
            argument (and may be used to provide a constraining data ID even
            when the ``dataId`` argument is `None`).

        Yields
        ------
        dataId : `DataCoordinate`
            Data IDs matching the given query parameters. Order is
            unspecified.

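        Examples
        --------
        A sketch constraining data IDs by the existence of a dataset; the
        dataset type, collection, and ``where`` values are hypothetical::

            dataIds = registry.queryDimensions(
                ["exposure", "detector"],
                datasets="raw",
                collections="HyperCam/raw/all",
                where="detector = 10",
            )
            for dataId in dataIds:
                print(dataId)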
        """
        dimensions = iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwargs)
        standardizedDatasetTypes = set()
        requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
        if datasets is not None:
            if collections is None:
                raise TypeError("Cannot pass 'datasets' without 'collections'.")
            for datasetType in self.queryDatasetTypes(datasets, components=components):
                requestedDimensionNames.update(datasetType.dimensions.names)
                # If any matched dataset type is a component, just operate on
                # its parent instead, because Registry doesn't know anything
                # about what components exist, and here (unlike queryDatasets)
                # we don't care about returning them.
                parentDatasetTypeName, componentName = datasetType.nameAndComponent()
                if componentName is not None:
                    datasetType = self.getDatasetType(parentDatasetTypeName)
                standardizedDatasetTypes.add(datasetType)
            # Preprocess collections expression in case the original included
            # single-pass iterators (we'll want to use it multiple times
            # below).
            collections = CollectionQuery.fromExpression(collections)

        summary = QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
        )
        builder = self.makeQueryBuilder(summary)
        for datasetType in standardizedDatasetTypes:
            builder.joinDataset(datasetType, collections, isResult=False)
        query = builder.finish()
        predicate = query.predicate()
        for row in self._db.query(query.sql):
            if predicate(row):
                result = query.extractDataId(row)
                if expand:
                    yield self.expandDataId(
                        result,
                        records=standardizedDataId.records,
                    )
                else:
                    yield result

    def queryDatasets(self, datasetType: Any, *,
                      collections: Any,
                      dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
                      dataId: Optional[DataId] = None,
                      where: Optional[str] = None,
                      deduplicate: bool = False,
                      expand: bool = True,
                      components: Optional[bool] = None,
                      **kwargs: Any) -> Iterator[DatasetRef]:
        """Query for and iterate over dataset references matching user-provided
        criteria.

        Parameters
        ----------
        datasetType
            An expression that fully or partially identifies the dataset types
            to be queried. Allowed types include `DatasetType`, `str`,
            `re.Pattern`, and iterables thereof. The special value `...` can
            be used to query all dataset types. See
            :ref:`daf_butler_dataset_type_expressions` for more information.
        collections
            An expression that fully or partially identifies the collections
            to search for datasets, such as a `str`, `re.Pattern`, or iterable
            thereof. `...` can be used to return all collections. See
            :ref:`daf_butler_collection_expressions` for more information.
        dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
            Dimensions to include in the query (in addition to those used
            to identify the queried dataset type(s)), either to constrain
            the resulting datasets to those for which a matching dimension
            exists, or to relate the dataset type's dimensions to dimensions
            referenced by the ``dataId`` or ``where`` arguments.
        dataId : `dict` or `DataCoordinate`, optional
            A data ID whose key-value pairs are used as equality constraints
            in the query.
        where : `str`, optional
            A string expression similar to a SQL WHERE clause. May involve
            any column of a dimension table or (as a shortcut for the primary
            key column of a dimension table) dimension name. See
            :ref:`daf_butler_dimension_expressions` for more information.
        deduplicate : `bool`, optional
            If `True` (`False` is default), for each result data ID, only
            yield one `DatasetRef` of each `DatasetType`, from the first
            collection in which a dataset of that dataset type appears
            (according to the order of ``collections`` passed in). If `True`,
            ``collections`` must not contain regular expressions and may not
            be `...`.
        expand : `bool`, optional
            If `True` (default) attach `DataCoordinate` instances for which
            `~DataCoordinate.hasRecords` is guaranteed to return `True`,
            performing extra database fetches as necessary.
        components : `bool`, optional
            If `True`, apply all dataset expression patterns to component
            dataset type names as well. If `False`, never apply patterns to
            components. If `None` (default), apply patterns to components only
            if their parent datasets were not matched by the expression.
            Fully-specified component datasets (`str` or `DatasetType`
            instances) are always included.
        **kwargs
            Additional keyword arguments are forwarded to
            `DataCoordinate.standardize` when processing the ``dataId``
            argument (and may be used to provide a constraining data ID even
            when the ``dataId`` argument is `None`).

        Yields
        ------
        ref : `DatasetRef`
            Dataset references matching the given query criteria. These
            are grouped by `DatasetType` if the query evaluates to multiple
            dataset types, but order is otherwise unspecified.

        Raises
        ------
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``deduplicate`` is `True`.

        Notes
        -----
        When multiple dataset types are queried in a single call, the
        results of this operation are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included. In contexts where
        that kind of information is important, the recommended pattern is to
        use `queryDimensions` to first obtain data IDs (possibly with the
        desired dataset types and collections passed as constraints to the
        query), and then use multiple (generally much simpler) calls to
        `queryDatasets` with the returned data IDs passed as constraints.

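        Examples
        --------
        A sketch querying one dataset type across two collections; all names
        shown are hypothetical::

            refs = list(registry.queryDatasets("flat",
                                               collections=["my/run", "my/tagged"],
                                               instrument="HyperCam"))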
        """
        # Standardize the collections expression.
        if deduplicate:
            collections = CollectionSearch.fromExpression(collections)
        else:
            collections = CollectionQuery.fromExpression(collections)
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwargs)

        # We can only query directly if given a non-component DatasetType
        # instance. If we were given an expression or str or a component
        # DatasetType instance, we'll populate this dict, recurse, and return.
        # If we already have a non-component DatasetType, it will remain None
        # and we'll run the query directly.
        composition: Optional[
            Dict[
                DatasetType,  # parent dataset type
                List[Optional[str]]  # component name, or None for parent
            ]
        ] = None
        if not isinstance(datasetType, DatasetType):
            # We were given a dataset type expression (which may be as simple
            # as a str). Loop over all matching datasets, delegating handling
            # of the `components` argument to queryDatasetTypes, as we populate
            # the composition dict.
            composition = defaultdict(list)
            for trueDatasetType in self.queryDatasetTypes(datasetType, components=components):
                parentName, componentName = trueDatasetType.nameAndComponent()
                if componentName is not None:
                    parentDatasetType = self.getDatasetType(parentName)
                    composition.setdefault(parentDatasetType, []).append(componentName)
                else:
                    composition.setdefault(trueDatasetType, []).append(None)
        elif datasetType.isComponent():
            # We were given a true DatasetType instance, but it's a component.
            # The composition dict will have exactly one item.
            parentName, componentName = datasetType.nameAndComponent()
            parentDatasetType = self.getDatasetType(parentName)
            composition = {parentDatasetType: [componentName]}
        if composition is not None:
            # We need to recurse. Do that once for each parent dataset type.
            for parentDatasetType, componentNames in composition.items():
                for parentRef in self.queryDatasets(parentDatasetType, collections=collections,
                                                    dimensions=dimensions, dataId=standardizedDataId,
                                                    where=where, deduplicate=deduplicate):
                    # Loop over the requested components, yielding a ref for
                    # each one.
                    for componentName in componentNames:
                        if componentName is None:
                            yield parentRef
                        else:
                            yield parentRef.makeComponentRef(componentName)
            return
        # If we get here, there's no need to recurse (or we are already
        # recursing; there can only ever be one level of recursion).

        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
        )
        builder = self.makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if
        # we need to deduplicate. Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate):
            return
        query = builder.finish()
        predicate = query.predicate()
        if not deduplicate:
            # No need to de-duplicate across collections.
            for row in self._db.query(query.sql):
                if predicate(row):
                    dataId = query.extractDataId(row, graph=datasetType.dimensions)
                    if expand:
                        dataId = self.expandDataId(
                            dataId,
                            records=standardizedDataId.records
                        )
                    yield query.extractDatasetRef(row, datasetType, dataId)[0]
        else:
            # For each data ID, yield only the DatasetRef with the lowest
            # collection rank.
            bestRefs = {}
            bestRanks: Dict[DataCoordinate, int] = {}
            for row in self._db.query(query.sql):
                if predicate(row):
                    ref, rank = query.extractDatasetRef(row, datasetType)
                    bestRank = bestRanks.get(ref.dataId, sys.maxsize)
                    assert rank is not None
                    if rank < bestRank:
                        bestRefs[ref.dataId] = ref
                        bestRanks[ref.dataId] = rank
            # If the caller requested expanded data IDs, we defer that until
            # here so we do as little expansion as possible.
            if expand:
                for ref in bestRefs.values():
                    dataId = self.expandDataId(
                        ref.dataId,
                        records=standardizedDataId.records
                    )
                    yield ref.expanded(dataId)
            else:
                yield from bestRefs.values()

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """