# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("Registry", "AmbiguousDatasetError", "ConflictingDefinitionError", "OrphanedRecordError")

import contextlib
import sys
from typing import (
    Any,
    FrozenSet,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Type,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy

from ..core import (
    Config,
    DataCoordinate,
    DataId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    ExpandedDataCoordinate,
    StorageClassFactory,
)
from ..core import ddl
from ..core.utils import doImport, iterable, transactional, NamedKeyDict
from ._config import RegistryConfig
from .queries import (
    CollectionsExpression,
    DatasetRegistryStorage,
    DatasetTypeExpression,
    QueryBuilder,
    QuerySummary,
)
from .tables import makeRegistryTableSpecs

if TYPE_CHECKING:
    from ..butlerConfig import ButlerConfig
    from ..core import (
        Quantum
    )
    from .interfaces import (
        Database,
        OpaqueTableStorageManager,
        DimensionRecordStorageManager,
    )


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` has no ID and a `Registry`
    operation requires one.
    """


class ConflictingDefinitionError(Exception):
    """Exception raised when trying to insert a database record when a
    conflicting record already exists.
    """


class OrphanedRecordError(Exception):
    """Exception raised when trying to remove or modify a database record
    that is still being used in some other table.
    """


def _expandComponents(refs: Iterable[DatasetRef]) -> Iterator[DatasetRef]:
    """Expand an iterable of datasets to include their components.

    Parameters
    ----------
    refs : iterable of `DatasetRef`
        An iterable of `DatasetRef` instances.

    Yields
    ------
    refs : `DatasetRef`
        Recursively expanded datasets.
    """
    for ref in refs:
        yield ref
        yield from _expandComponents(ref.components.values())


def _checkAndGetId(ref: DatasetRef) -> int:
    """Return the ID of the given `DatasetRef`, or raise if it is `None`.

    This trivial function exists to allow operations that would otherwise be
    natural list comprehensions to check that the ID is not `None` as well.

    Parameters
    ----------
    ref : `DatasetRef`
        Dataset reference.

    Returns
    -------
    id : `int`
        ``ref.id``

    Raises
    ------
    AmbiguousDatasetError
        Raised if ``ref.id`` is `None`.
    """
    if ref.id is None:
        raise AmbiguousDatasetError("Dataset ID must not be `None`.")
    return ref.id


class Registry:
    """Registry interface.

    Parameters
    ----------
    config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
        Registry configuration.
    """

    defaultConfigFile = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
                   butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
        """Create `Registry` subclass instance from `config`.

        Uses ``registry.cls`` from `config` to determine which subclass to
        instantiate.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration.
        create : `bool`, optional
            Assume an empty Registry and create a new one.
        butlerRoot : `str`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.

        Returns
        -------
        registry : `Registry` (subclass)
            A new `Registry` subclass instance.
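
        Examples
        --------
        An illustrative sketch only; the configuration path below is a
        placeholder and must point at an existing registry configuration:

        >>> config = RegistryConfig("/path/to/registry.yaml")  # doctest: +SKIP
        >>> registry = Registry.fromConfig(config, writeable=False)  # doctest: +SKIP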

        """
        if not isinstance(config, RegistryConfig):
            if isinstance(config, str) or isinstance(config, Config):
                config = RegistryConfig(config)
            else:
                raise ValueError("Incompatible Registry configuration: {}".format(config))
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        universe = DimensionUniverse(config)
        opaque = doImport(config["managers", "opaque"])
        dimensions = doImport(config["managers", "dimensions"])
        return cls(database, universe, dimensions=dimensions, opaque=opaque, create=create)

    def __init__(self, database: Database, universe: DimensionUniverse, *,
                 opaque: Type[OpaqueTableStorageManager],
                 dimensions: Type[DimensionRecordStorageManager],
                 create: bool = False):
        self._db = database
        self.storageClasses = StorageClassFactory()
        with self._db.declareStaticTables(create=create) as context:
            self._dimensions = dimensions.initialize(self._db, context, universe=universe)
            self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions))
            self._opaque = opaque.initialize(self._db, context)
        # TODO: we shouldn't be grabbing the private connection from the
        # Database instance like this, but it's a reasonable way to proceed
        # while we transition to using the Database API more.
        self._connection = self._db._connection
        self._datasetStorage = DatasetRegistryStorage(connection=self._connection,
                                                      universe=self.dimensions,
                                                      tables=self._tables._asdict())
        self._datasetTypes = {}
        self._runIdsByName = {}  # key = name, value = id
        self._runNamesById = {}  # key = id, value = name

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"Registry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        """Return `True` if this registry allows write operations, and `False`
        otherwise.
        """
        return self._db.isWriteable()

    @property
    def dimensions(self) -> DimensionUniverse:
        """All dimensions recognized by this `Registry` (`DimensionUniverse`).
        """
        return self._dimensions.universe

    @contextlib.contextmanager
    def transaction(self):
        """Return a context manager that represents a transaction.
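
        Examples
        --------
        Intended for use as a context manager, so a group of operations is
        committed or rolled back together (illustrative only; ``refs`` is
        assumed to be a sequence of resolved `DatasetRef` instances and the
        collection name is a placeholder):

        >>> with registry.transaction():  # doctest: +SKIP
        ...     registry.associate("example/collection", refs)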

        """
        # TODO make savepoint=False the default.
        try:
            with self._db.transaction():
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._dimensions.clearCaches()
            self._datasetTypes.clear()
            raise

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved via
        `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict):
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
        """
        self._opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
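
        Examples
        --------
        A sketch of the insert/fetch round trip; the table name and column
        names here are hypothetical and must correspond to a table previously
        registered via `registerOpaqueTable`:

        >>> registry.insertOpaqueData("example_datastore_records",  # doctest: +SKIP
        ...                           {"dataset_id": 1, "path": "a.fits"})
        >>> rows = list(registry.fetchOpaqueData("example_datastore_records",  # doctest: +SKIP
        ...                                      dataset_id=1))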

        """
        yield from self._opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any):
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._opaque[tableName].delete(**where)

    def getAllCollections(self):
        """Get names of all the collections found in this repository.

        Returns
        -------
        collections : `set` of `str`
            The collections.
        """
        table = self._tables.dataset_collection
        result = self._db.query(sqlalchemy.sql.select([table.c.collection]).distinct()).fetchall()
        if result is None:
            return set()
        return {r[0] for r in result}

    def registerRun(self, name: str):
        """Add a new run if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the run to create.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        id = self._runIdsByName.get(name)
        if id is None:
            (id,), _ = self._db.sync(self._tables.run, keys={"name": name}, returning=["id"])
            self._runIdsByName[name] = id
            self._runNamesById[id] = name
        # Assume that if the run is in the cache, it's in the database, because
        # right now there's no way to delete them.

    def _getRunNameFromId(self, id: int) -> str:
        """Return the name of the run associated with the given integer ID.
        """
        assert isinstance(id, int)
        name = self._runNamesById.get(id)
        if name is None:
            table = self._tables.run
            name = self._db.query(
                sqlalchemy.sql.select(
                    [table.columns.name]
                ).select_from(
                    table
                ).where(
                    table.columns.id == id
                )
            ).scalar()
            self._runNamesById[id] = name
            self._runIdsByName[name] = id
        return name

    def _getRunIdFromName(self, name: str) -> int:
        """Return the integer ID of the run associated with the given name.
        """
        assert isinstance(name, str)
        id = self._runIdsByName.get(name)
        if id is None:
            table = self._tables.run
            id = self._db.query(
                sqlalchemy.sql.select(
                    [table.columns.id]
                ).select_from(
                    table
                ).where(
                    table.columns.name == name
                )
            ).scalar()
            self._runNamesById[id] = name
            self._runIdsByName[name] = id
        return id

    @transactional
    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        """Add a new `DatasetType` to the Registry.

        It is not an error to register the same `DatasetType` twice.

        Parameters
        ----------
        datasetType : `DatasetType`
            The `DatasetType` to be added.

        Returns
        -------
        inserted : `bool`
            `True` if ``datasetType`` was inserted, `False` if an identical
            existing `DatasetType` was found. Note that in either case the
            DatasetType is guaranteed to be defined in the Registry
            consistently with the given definition.

        Raises
        ------
        ValueError
            Raised if the dimensions or storage class are invalid.
        ConflictingDefinitionError
            Raised if this DatasetType is already registered with a different
            definition.
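
        Examples
        --------
        An illustrative sketch; the dataset type name, dimensions, and
        storage class used here are placeholders and must already be known
        to the repository's dimension universe and storage class factory:

        >>> storageClass = registry.storageClasses.getStorageClass(  # doctest: +SKIP
        ...     "StructuredDataDict")
        >>> datasetType = DatasetType("example_metadata",  # doctest: +SKIP
        ...                           dimensions=registry.dimensions.extract(["instrument", "visit"]),
        ...                           storageClass=storageClass)
        >>> registry.registerDatasetType(datasetType)  # doctest: +SKIP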

        """
        # TODO: this implementation isn't concurrent, except *maybe* in SQLite
        # with aggressive locking (where starting a transaction is essentially
        # the same as grabbing a full-database lock). Should be reimplemented
        # with Database.sync to fix this, but that may require schema changes
        # as well so we only have to synchronize one row to know if we have
        # inconsistent definitions.

        # If the DatasetType is already in the cache, we assume it's already in
        # the DB (note that we don't actually provide a way to remove them from
        # the DB).
        existingDatasetType = self._datasetTypes.get(datasetType.name)
        # If it's not in the cache, try to insert it.
        if existingDatasetType is None:
            try:
                with self._db.transaction():
                    self._db.insert(
                        self._tables.dataset_type,
                        {
                            "dataset_type_name": datasetType.name,
                            "storage_class": datasetType.storageClass.name,
                        }
                    )
            except sqlalchemy.exc.IntegrityError:
                # Insert failed on the only unique constraint on this table:
                # dataset_type_name. So now the question is whether the one in
                # there is the same as the one we tried to insert.
                existingDatasetType = self.getDatasetType(datasetType.name)
            else:
                # If adding the DatasetType record itself succeeded, add its
                # dimensions (if any). We don't guard this in a try block
                # because a problem with this insert means the database
                # content must be corrupted.
                if datasetType.dimensions:
                    self._db.insert(
                        self._tables.dataset_type_dimensions,
                        *[{"dataset_type_name": datasetType.name,
                           "dimension_name": dimensionName}
                          for dimensionName in datasetType.dimensions.names]
                    )
                # Update the cache.
                self._datasetTypes[datasetType.name] = datasetType
                # Also register component DatasetTypes (if any).
                for compName, compStorageClass in datasetType.storageClass.components.items():
                    compType = DatasetType(datasetType.componentTypeName(compName),
                                           dimensions=datasetType.dimensions,
                                           storageClass=compStorageClass)
                    self.registerDatasetType(compType)
                # Inserts succeeded, nothing left to do here.
                return True
        # A DatasetType with this name exists; check whether it is equal.
        if datasetType == existingDatasetType:
            return False
        else:
            raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}")

    def getDatasetType(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        KeyError
            Raised if the requested DatasetType could not be found in the
            registry.
        """
        datasetType = self._datasetTypes.get(name)
        if datasetType is None:
            # Get StorageClass from DatasetType table
            result = self._db.query(
                sqlalchemy.sql.select(
                    [self._tables.dataset_type.c.storage_class]
                ).where(
                    self._tables.dataset_type.columns.dataset_type_name == name
                )
            ).fetchone()

            if result is None:
                raise KeyError("Could not find entry for datasetType {}".format(name))

            storageClass = self.storageClasses.getStorageClass(result["storage_class"])
            # Get Dimensions (if any) from DatasetTypeDimensions table
            result = self._db.query(
                sqlalchemy.sql.select(
                    [self._tables.dataset_type_dimensions.columns.dimension_name]
                ).where(
                    self._tables.dataset_type_dimensions.columns.dataset_type_name == name
                )
            ).fetchall()
            dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ())
            datasetType = DatasetType(name=name,
                                      storageClass=storageClass,
                                      dimensions=dimensions)
            self._datasetTypes[name] = datasetType
        return datasetType

    def getAllDatasetTypes(self) -> FrozenSet[DatasetType]:
        """Get every registered `DatasetType`.

        Returns
        -------
        types : `frozenset` of `DatasetType`
            Every `DatasetType` in the registry.
        """
        # Get all the registered names
        result = self._db.query(
            sqlalchemy.sql.select(
                [self._tables.dataset_type.columns.dataset_type_name]
            )
        ).fetchall()
        if result is None:
            return frozenset()
        datasetTypeNames = [r[0] for r in result]
        return frozenset(self.getDatasetType(name) for name in datasetTypeNames)

    def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy,
                               datasetType: Optional[DatasetType] = None,
                               dataId: Optional[DataCoordinate] = None):
        """Construct a DatasetRef from the result of a query on the Dataset
        table.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            Row of a query that contains all columns from the `Dataset` table.
            May include additional fields (which will be ignored).
        datasetType : `DatasetType`, optional
            `DatasetType` associated with this dataset. Will be retrieved
            if not provided. If provided, the caller guarantees that it is
            already consistent with what would have been retrieved from the
            database.
        dataId : `DataCoordinate`, optional
            Dimensions associated with this dataset. Will be retrieved if not
            provided. If provided, the caller guarantees that it is already
            consistent with what would have been retrieved from the database.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` instance.
        """
        if datasetType is None:
            datasetType = self.getDatasetType(row["dataset_type_name"])
        run = self._getRunNameFromId(row["run_id"])
        datasetRefHash = row["dataset_ref_hash"]
        if dataId is None:
            # TODO: should we expand here?
            dataId = DataCoordinate.standardize(
                row,
                graph=datasetType.dimensions,
                universe=self.dimensions
            )
        # Get components (if present)
        components = {}
        if datasetType.storageClass.isComposite():
            t = self._tables
            columns = list(t.dataset.columns)
            columns.append(t.dataset_composition.columns.component_name)
            results = self._db.query(
                sqlalchemy.sql.select(
                    columns
                ).select_from(
                    t.dataset.join(
                        t.dataset_composition,
                        (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id)
                    )
                ).where(
                    t.dataset_composition.columns.parent_dataset_id == row["dataset_id"]
                )
            ).fetchall()
            for result in results:
                componentName = result["component_name"]
                componentDatasetType = DatasetType(
                    DatasetType.nameWithComponent(datasetType.name, componentName),
                    dimensions=datasetType.dimensions,
                    storageClass=datasetType.storageClass.components[componentName]
                )
                components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId,
                                                                        datasetType=componentDatasetType)
            if not components.keys() <= datasetType.storageClass.components.keys():
                raise RuntimeError(
                    f"Inconsistency detected between dataset and storage class definitions: "
                    f"{datasetType.storageClass.name} has components "
                    f"{set(datasetType.storageClass.components.keys())}, "
                    f"but dataset has components {set(components.keys())}"
                )
        return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run,
                          hash=datasetRefHash, components=components)

    def find(self, collection: str, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None,
             **kwds: Any) -> Optional[DatasetRef]:
        """Look up a dataset.

        This can be used to obtain a `DatasetRef` that permits the dataset to
        be read from a `Datastore`.

        Parameters
        ----------
        collection : `str`
            Identifies the collection to search.
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict`-like object containing the `Dimension` links that identify
            the dataset within a collection.
        **kwds
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``dataId`` to a true
            `DataCoordinate` or augment an existing one.

        Returns
        -------
        ref : `DatasetRef`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        LookupError
            Raised if one or more data ID keys are missing.
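
        Examples
        --------
        An illustrative sketch; the collection, dataset type, and dimension
        values below are placeholders and are not guaranteed to exist in any
        particular repository:

        >>> ref = registry.find("calibration/run1", "flat",  # doctest: +SKIP
        ...                     instrument="ExampleCam", detector=12,
        ...                     physical_filter="g")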

        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self.getDatasetType(datasetType)
        dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                            universe=self.dimensions, **kwds)
        whereTerms = [
            self._tables.dataset.columns.dataset_type_name == datasetType.name,
            self._tables.dataset_collection.columns.collection == collection,
        ]
        whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys())
        result = self._db.query(
            self._tables.dataset.select().select_from(
                self._tables.dataset.join(self._tables.dataset_collection)
            ).where(
                sqlalchemy.sql.and_(*whereTerms)
            )
        ).fetchone()
        if result is None:
            return None
        return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
                       ) -> List[DatasetRef]:
        """Insert one or more datasets into the `Registry`.

        This always adds new datasets; to associate existing datasets with
        a new collection, use ``associate``.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
            Dimension-based identifiers for the new datasets.
        run : `str`
            The name of the run that produced the datasets.
        producer : `Quantum`
            Unit of work that produced the datasets. May be `None` to store
            no provenance information, but if present the `Quantum` must
            already have been added to the Registry.
        recursive : `bool`
            If `True`, recursively add datasets and attach entries for
            component datasets as well.

        Returns
        -------
        refs : `list` of `DatasetRef`
            Resolved `DatasetRef` instances for all given data IDs (in the same
            order).

        Raises
        ------
        ConflictingDefinitionError
            Raised if a dataset with the same dataset type and data ID as one
            of those given already exists in the given collection.
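
        Examples
        --------
        An illustrative sketch; the dataset type, run name, and data ID below
        are placeholders and must be consistent with the dataset types and
        dimension data already registered in the repository:

        >>> registry.registerRun("example/run")  # doctest: +SKIP
        >>> refs = registry.insertDatasets(  # doctest: +SKIP
        ...     "example_metadata",
        ...     dataIds=[{"instrument": "ExampleCam", "visit": 1}],
        ...     run="example/run")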

        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self.getDatasetType(datasetType)
        rows = []
        refs = []
        base = {
            "dataset_type_name": datasetType.name,
            "run_id": self._getRunIdFromName(run),
            "quantum_id": producer.id if producer is not None else None,
        }
        # Expand data IDs and build both a list of unresolved DatasetRefs
        # and a list of dictionary rows for the dataset table.
        for dataId in dataIds:
            ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions))
            refs.append(ref)
            row = dict(base, dataset_ref_hash=ref.hash)
            for dimension, value in ref.dataId.full.items():
                row[dimension.name] = value
            rows.append(row)
        # Actually insert into the dataset table.
        datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True)
        # Resolve the DatasetRefs with the autoincrement IDs we generated.
        refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)]
        # Associate the datasets with the run as a collection. Note that we
        # do this before inserting component datasets so recursing doesn't try
        # to associate those twice.
        self.associate(run, refs)
        if recursive and datasetType.isComposite():
            # Insert component rows by recursing, and gather a single big list
            # of rows to insert into the dataset_composition table.
            compositionRows = []
            for componentName in datasetType.storageClass.components:
                componentDatasetType = datasetType.makeComponentDatasetType(componentName)
                componentRefs = self.insertDatasets(componentDatasetType,
                                                    dataIds=(ref.dataId for ref in refs),
                                                    run=run,
                                                    producer=producer,
                                                    recursive=True)
                for parentRef, componentRef in zip(refs, componentRefs):
                    parentRef._components[componentName] = componentRef
                    compositionRows.append({
                        "parent_dataset_id": parentRef.id,
                        "component_dataset_id": componentRef.id,
                        "component_name": componentName,
                    })
            if compositionRows:
                self._db.insert(self._tables.dataset_composition, *compositionRows)
        return refs

    def getDataset(self, id: int, datasetType: Optional[DatasetType] = None,
                   dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `int`
            The unique identifier for the Dataset.
        datasetType : `DatasetType`, optional
            The `DatasetType` of the dataset to retrieve. This is used to
            short-circuit retrieving the `DatasetType`, so if provided, the
            caller is guaranteeing that it is what would have been retrieved.
        dataId : `DataCoordinate`, optional
            A `Dimension`-based identifier for the dataset within a
            collection, possibly containing additional metadata. This is used
            to short-circuit retrieving the dataId, so if provided, the
            caller is guaranteeing that it is what would have been retrieved.

        Returns
        -------
        ref : `DatasetRef`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
        """
        result = self._db.query(
            self._tables.dataset.select().where(
                self._tables.dataset.columns.dataset_id == id
            )
        ).fetchone()
        if result is None:
            return None
        return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)

    @transactional
    def removeDataset(self, ref: DatasetRef):
        """Remove a dataset from the Registry.

        The dataset and all components will be removed unconditionally from
        all collections, and any associated `Quantum` records will also be
        removed. `Datastore` records will *not* be deleted; the caller is
        responsible for ensuring that the dataset has already been removed
        from all Datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the dataset to be removed. Must include a valid
            ``id`` attribute, and should be considered invalidated upon return.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        OrphanedRecordError
            Raised if the dataset is still present in any `Datastore`.
        """
        if not ref.id:
            raise AmbiguousDatasetError(f"Cannot remove dataset {ref} without ID.")
        # Remove component datasets. We assume ``ref.components`` is already
        # correctly populated, and rely on ON DELETE CASCADE to remove entries
        # from DatasetComposition.
        for componentRef in ref.components.values():
            self.removeDataset(componentRef)

        # Remove related quanta. We rely on ON DELETE CASCADE to remove any
        # related records in dataset_consumers. Note that we permit a Quantum
        # to be deleted without removing the datasets it refers to, but do not
        # allow a dataset to be deleted without removing the Quanta that refer
        # to them. A dataset is still quite usable without provenance, but
        # provenance is worthless if it's inaccurate.
        t = self._tables
        selectProducer = sqlalchemy.sql.select(
            [t.dataset.columns.quantum_id]
        ).where(
            t.dataset.columns.dataset_id == ref.id
        )
        selectConsumers = sqlalchemy.sql.select(
            [t.dataset_consumers.columns.quantum_id]
        ).where(
            t.dataset_consumers.columns.dataset_id == ref.id
        )
        # TODO: we'd like to use Database.delete here, but it doesn't support
        # general queries yet.
        self._connection.execute(
            t.quantum.delete().where(
                t.quantum.columns.id.in_(sqlalchemy.sql.union(selectProducer, selectConsumers))
            )
        )
        # Remove the Dataset record itself. We rely on ON DELETE CASCADE to
        # remove from DatasetCollection, and assume foreign key violations
        # come from DatasetLocation (everything else should have an ON DELETE).
        try:
            self._connection.execute(
                t.dataset.delete().where(t.dataset.c.dataset_id == ref.id)
            )
        except sqlalchemy.exc.IntegrityError as err:
            raise OrphanedRecordError(f"Dataset {ref} is still present in one or more Datastores.") from err

    @transactional
    def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef):
        """Attach a component to a dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.
        parent : `DatasetRef`
            A reference to the parent dataset. Will be updated to reference
            the component.
        component : `DatasetRef`
            A reference to the component dataset.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``parent.id`` or ``component.id`` is `None`.
        """
        # TODO Insert check for component name and type against
        # parent.storageClass specified components
        if parent.id is None:
            raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.")
        if component.id is None:
            raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.")
        values = dict(component_name=name,
                      parent_dataset_id=parent.id,
                      component_dataset_id=component.id)
        self._db.insert(self._tables.dataset_composition, values)
        parent._components[name] = component

    @transactional
    def associate(self, collection: str, refs: List[DatasetRef]):
        """Add existing Datasets to a collection, implicitly creating the
        collection if it does not already exist.

        If a `DatasetRef` with the exact same ``dataset_id`` is already in a
        collection, nothing is changed. If a `DatasetRef` with the same
        `DatasetType` and dimension values but with a different ``dataset_id``
        exists in the collection, `ValueError` is raised.

        Parameters
        ----------
        collection : `str`
            Indicates the collection the Datasets should be associated with.
        refs : iterable of `DatasetRef`
            An iterable of `DatasetRef` instances that already exist in this
            `Registry`. All component datasets will be associated with the
            collection as well.

        Raises
        ------
        ConflictingDefinitionError
            Raised if a Dataset with the given `DatasetRef` already exists in
            the given collection.
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        """
        rows = [{"dataset_id": _checkAndGetId(ref),
                 "dataset_ref_hash": ref.hash,
                 "collection": collection}
                for ref in _expandComponents(refs)]
        try:
            self._db.replace(self._tables.dataset_collection, *rows)
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"Constraint violation while associating datasets with collection {collection}. "
                f"This probably means that one or more datasets with the same dataset type and data ID "
                f"already exist in the collection, but it may also indicate that the datasets do not exist."
            ) from err

    @transactional
    def disassociate(self, collection: str, refs: List[DatasetRef]):
        """Remove existing Datasets from a collection.

        ``collection`` and ``ref`` combinations that are not currently
        associated are silently ignored.

        Parameters
        ----------
        collection : `str`
            The collection the Datasets should no longer be associated with.
        refs : `list` of `DatasetRef`
            A `list` of `DatasetRef` instances that already exist in this
            `Registry`. All component datasets will also be removed.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        """
        rows = [{"dataset_id": _checkAndGetId(ref), "collection": collection}
                for ref in _expandComponents(refs)]
        self._db.delete(self._tables.dataset_collection, ["dataset_id", "collection"], *rows)

    @transactional
    def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
        """Record that a datastore holds the given datasets.

        Typically used by `Datastore`.

        Parameters
        ----------
        datastoreName : `str`
            Name of the datastore holding these datasets.
        refs : `~collections.abc.Iterable` of `DatasetRef`
            References to the datasets.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        """
        self._db.insert(
            self._tables.dataset_storage,
            *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
        )

    def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
        """Retrieve datastore locations for a given dataset.

        Typically used by `Datastore`.

        Parameters
        ----------
        ref : `DatasetRef`
            A reference to the dataset for which to retrieve storage
            information.

        Returns
        -------
        datastores : `set` of `str`
            All the matching datastores holding this dataset. Empty set
            if the dataset does not exist anywhere.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        table = self._tables.dataset_storage
        result = self._db.query(
            sqlalchemy.sql.select(
                [table.columns.datastore_name]
            ).where(
                table.columns.dataset_id == ref.id
            )
        ).fetchall()
        return {r["datastore_name"] for r in result}

    @transactional
    def removeDatasetLocation(self, datastoreName, ref):
        """Remove datastore location associated with this dataset.

        Typically used by `Datastore` when a dataset is removed.

        Parameters
        ----------
        datastoreName : `str`
            Name of this `Datastore`.
        ref : `DatasetRef`
            A reference to the dataset for which information is to be removed.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        self._db.delete(
            self._tables.dataset_storage,
            ["dataset_id", "datastore_name"],
            {"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName}
        )

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
        """Expand a dimension-based data ID to include additional information.

        Parameters
        ----------
        dataId : `DataCoordinate` or `dict`, optional
            Data ID to be expanded; augmented and overridden by ``kwds``.
        graph : `DimensionGraph`, optional
            Set of dimensions for the expanded ID. If `None`, the dimensions
            will be inferred from the keys of ``dataId`` and ``kwds``.
            Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
            are silently ignored, providing a way to extract and expand a
            subset of a data ID.
        records : mapping [`DimensionElement`, `DimensionRecord`], optional
            Dimension record data to use before querying the database for that
            data.
        **kwds
            Additional keywords are treated like additional key-value pairs for
            ``dataId``, extending and overriding it.

        Returns
        -------
        expanded : `ExpandedDataCoordinate`
            A data ID that includes full metadata for all of the dimensions it
            identifies.
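
        Examples
        --------
        An illustrative sketch; the dimension names and values are
        placeholders and must correspond to dimension records already present
        in the registry. The returned `ExpandedDataCoordinate` carries the
        matching `DimensionRecord` objects in its ``records`` attribute:

        >>> expanded = registry.expandDataId(  # doctest: +SKIP
        ...     instrument="ExampleCam", detector=12)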

        """
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
        if isinstance(standardized, ExpandedDataCoordinate):
            return standardized
        elif isinstance(dataId, ExpandedDataCoordinate):
            records = dict(records) if records is not None else {}
            records.update(dataId.records)
        else:
            records = dict(records) if records is not None else {}
        keys = dict(standardized)
        for element in standardized.graph._primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                storage = self._dimensions[element]
                record = storage.fetch(keys)
                records[element] = record
            if record is not None:
                keys.update((d, getattr(record, d.name)) for d in element.implied)
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                records.update((d, None) for d in element.implied)
        return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[dict, DimensionRecord],
                            conform: bool = True):
        """Insert one or more dimension records into the database.

        Parameters
        ----------
        element : `DimensionElement` or `str`
            The `DimensionElement` or name thereof that identifies the table
            records will be inserted into.
        data : `dict` or `DimensionRecord` (variadic)
            One or more records to insert.
        conform : `bool`, optional
            If `False` (`True` is default) perform no checking or conversions,
            and assume that ``element`` is a `DimensionElement` instance and
            ``data`` is one or more `DimensionRecord` instances of the
            appropriate subclass.
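
        Examples
        --------
        An illustrative sketch; the dimension element and record values are
        placeholders, and the fields actually required depend on the
        dimension universe configured for the repository:

        >>> registry.insertDimensionData(  # doctest: +SKIP
        ...     "instrument",
        ...     {"name": "ExampleCam", "detector_max": 4})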

        """
        if conform:
            element = self.dimensions[element]  # if this is a name, convert it to a true DimensionElement.
            records = [element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row
                       for row in data]
        else:
            records = data
        storage = self._dimensions[element]
        storage.insert(*records)

    def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder:
        """Return a `QueryBuilder` instance capable of constructing and
        managing more complex queries than those obtainable via `Registry`
        interfaces.

        This is an advanced `SqlRegistry`-only interface; downstream code
        should prefer `Registry.queryDimensions` and `Registry.queryDatasets`
        whenever those are sufficient.

        Parameters
        ----------
        summary : `QuerySummary`
            Object describing and categorizing the full set of dimensions that
            will be included in the query.

        Returns
        -------
        builder : `QueryBuilder`
            Object that can be used to construct and perform advanced queries.
        """
        return QueryBuilder(connection=self._connection, summary=summary,
                            dimensionStorage=self._dimensions,
                            datasetStorage=self._datasetStorage)

    def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *,
                        dataId: Optional[DataId] = None,
                        datasets: Optional[Mapping[DatasetTypeExpression, CollectionsExpression]] = None,
                        where: Optional[str] = None,
                        expand: bool = True,
                        **kwds) -> Iterator[DataCoordinate]:
        """Query for and iterate over data IDs matching user-provided criteria.

        Parameters
        ----------
        dimensions : `Dimension` or `str`, or iterable thereof
            The dimensions of the data IDs to yield, as either `Dimension`
            instances or `str`. Will be automatically expanded to a complete
            `DimensionGraph`.
        dataId : `dict` or `DataCoordinate`, optional
            A data ID whose key-value pairs are used as equality constraints
            in the query.
        datasets : `~collections.abc.Mapping`, optional
            Datasets whose existence in the registry constrain the set of data
            IDs returned. This is a mapping from a dataset type expression
            (a `str` name, a true `DatasetType` instance, a `Like` pattern
            for the name, or ``...`` for all DatasetTypes) to a collections
            expression (a sequence of `str` or `Like` patterns, or `...` for
            all collections).
        where : `str`, optional
            A string expression similar to a SQL WHERE clause. May involve
            any column of a dimension table or (as a shortcut for the primary
            key column of a dimension table) dimension name.
        expand : `bool`, optional
            If `True` (default) yield `ExpandedDataCoordinate` instead of
            minimal `DataCoordinate` base-class instances.
        kwds
            Additional keyword arguments are forwarded to
            `DataCoordinate.standardize` when processing the ``dataId``
            argument (and may be used to provide a constraining data ID even
            when the ``dataId`` argument is `None`).

        Yields
        ------
        dataId : `DataCoordinate`
            Data IDs matching the given query parameters. Order is
            unspecified.
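
        Examples
        --------
        An illustrative sketch; the dimension names, dataset type, collection,
        and ``where`` expression below are placeholders:

        >>> dataIds = registry.queryDimensions(  # doctest: +SKIP
        ...     ["visit", "detector"],
        ...     datasets={"raw": ["example/run"]},
        ...     where="visit > 100",
        ...     instrument="ExampleCam")
        >>> for dataId in dataIds:  # doctest: +SKIP
        ...     print(dataId)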

        """
        dimensions = iterable(dimensions)
        standardizedDataId = self.expandDataId(dataId, **kwds)
        standardizedDatasets = NamedKeyDict()
        requestedDimensionNames = set(self.dimensions.extract(dimensions).names)
        if datasets is not None:
            for datasetTypeExpr, collectionsExpr in datasets.items():
                for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetTypeExpr,
                                                                              collections=collectionsExpr,
                                                                              dataId=standardizedDataId):
                    requestedDimensionNames.update(trueDatasetType.dimensions.names)
                    standardizedDatasets[trueDatasetType] = collectionsExpr
        summary = QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
        )
        builder = self.makeQueryBuilder(summary)
        for datasetType, collections in standardizedDatasets.items():
            builder.joinDataset(datasetType, collections, isResult=False)
        query = builder.finish()
        predicate = query.predicate()
        for row in query.execute():
            if predicate(row):
                result = query.extractDataId(row)
                if expand:
                    yield self.expandDataId(result, records=standardizedDataId.records)
                else:
                    yield result

    def queryDatasets(self, datasetType: DatasetTypeExpression, *,
                      collections: CollectionsExpression,
                      dimensions: Optional[Iterable[Union[Dimension, str]]] = None,
                      dataId: Optional[DataId] = None,
                      where: Optional[str] = None,
                      deduplicate: bool = False,
                      expand: bool = True,
                      **kwds) -> Iterator[DatasetRef]:
        """Query for and iterate over dataset references matching user-provided
        criteria.

        Parameters
        ----------
        datasetType : `DatasetType`, `str`, `Like`, or ``...``
            An expression indicating type(s) of datasets to query for.
            ``...`` may be used to query for all known DatasetTypes.
            Multiple explicitly-provided dataset types cannot be queried in a
            single call to `queryDatasets` even though wildcard expressions
            can, because the results would be identical to chaining the
            iterators produced by multiple calls to `queryDatasets`.
        collections : `~collections.abc.Sequence` of `str` or `Like`, or ``...``
            An expression indicating the collections to be searched for
            datasets. ``...`` may be passed to search all collections.
        dimensions : `~collections.abc.Iterable` of `Dimension` or `str`
            Dimensions to include in the query (in addition to those used
            to identify the queried dataset type(s)), either to constrain
            the resulting datasets to those for which a matching dimension
            exists, or to relate the dataset type's dimensions to dimensions
            referenced by the ``dataId`` or ``where`` arguments.
        dataId : `dict` or `DataCoordinate`, optional
            A data ID whose key-value pairs are used as equality constraints
            in the query.
        where : `str`, optional
            A string expression similar to a SQL WHERE clause. May involve
            any column of a dimension table or (as a shortcut for the primary
            key column of a dimension table) dimension name.
        deduplicate : `bool`, optional
            If `True` (`False` is default), for each result data ID, only
            yield one `DatasetRef` of each `DatasetType`, from the first
            collection in which a dataset of that dataset type appears
            (according to the order of ``collections`` passed in). Cannot be
            used if any element in ``collections`` is an expression.
        expand : `bool`, optional
            If `True` (default) attach `ExpandedDataCoordinate` instead of
            minimal `DataCoordinate` base-class instances.
        kwds
            Additional keyword arguments are forwarded to
            `DataCoordinate.standardize` when processing the ``dataId``
            argument (and may be used to provide a constraining data ID even
            when the ``dataId`` argument is `None`).

        Yields
        ------
        ref : `DatasetRef`
            Dataset references matching the given query criteria. These
            are grouped by `DatasetType` if the query evaluates to multiple
            dataset types, but order is otherwise unspecified.

        Raises
        ------
        TypeError
            Raised when the arguments are incompatible, such as when a
            collection wildcard is passed when ``deduplicate`` is `True`.

        Notes
        -----
        When multiple dataset types are queried via a wildcard expression, the
        results of this operation are equivalent to querying for each dataset
        type separately in turn, and no information about the relationships
        between datasets of different types is included. In contexts where
        that kind of information is important, the recommended pattern is to
        use `queryDimensions` to first obtain data IDs (possibly with the
        desired dataset types and collections passed as constraints to the
        query), and then use multiple (generally much simpler) calls to
        `queryDatasets` with the returned data IDs passed as constraints.
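
        Examples
        --------
        An illustrative sketch; the dataset type, collection names, and
        ``where`` expression below are placeholders:

        >>> refs = registry.queryDatasets(  # doctest: +SKIP
        ...     "calexp",
        ...     collections=["example/run1", "example/run2"],
        ...     where="detector = 10",
        ...     deduplicate=True,
        ...     instrument="ExampleCam")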

        """
        # Standardize and expand the data ID provided as a constraint.
        standardizedDataId = self.expandDataId(dataId, **kwds)
        # If the datasetType passed isn't actually a DatasetType, expand it
        # (it could be an expression that yields multiple DatasetTypes) and
        # recurse.
        if not isinstance(datasetType, DatasetType):
            for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType,
                                                                          collections=collections,
                                                                          dataId=standardizedDataId):
                yield from self.queryDatasets(trueDatasetType, collections=collections,
                                              dimensions=dimensions, dataId=standardizedDataId,
                                              where=where, deduplicate=deduplicate)
            return
        # The full set of dimensions in the query is the combination of those
        # needed for the DatasetType and those explicitly requested, if any.
        requestedDimensionNames = set(datasetType.dimensions.names)
        if dimensions is not None:
            requestedDimensionNames.update(self.dimensions.extract(dimensions).names)
        # Construct the summary structure needed to construct a QueryBuilder.
        summary = QuerySummary(
            requested=DimensionGraph(self.dimensions, names=requestedDimensionNames),
            dataId=standardizedDataId,
            expression=where,
        )
        builder = self.makeQueryBuilder(summary)
        # Add the dataset subquery to the query, telling the QueryBuilder to
        # include the rank of the selected collection in the results only if we
        # need to deduplicate.  Note that if any of the collections are
        # actually wildcard expressions, and we've asked for deduplication,
        # this will raise TypeError for us.
        builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate)
        query = builder.finish()
        predicate = query.predicate()
        if not deduplicate or len(collections) == 1:
            # No need to de-duplicate across collections.
            for row in query.execute():
                if predicate(row):
                    dataId = query.extractDataId(row, graph=datasetType.dimensions)
                    if expand:
                        dataId = self.expandDataId(dataId, records=standardizedDataId.records)
                    yield query.extractDatasetRef(row, datasetType, dataId)[0]
        else:
            # For each data ID, yield only the DatasetRef with the lowest
            # collection rank.
            bestRefs = {}
            bestRanks = {}
            for row in query.execute():
                if predicate(row):
                    ref, rank = query.extractDatasetRef(row, datasetType)
                    bestRank = bestRanks.get(ref.dataId, sys.maxsize)
                    if rank < bestRank:
                        bestRefs[ref.dataId] = ref
                        bestRanks[ref.dataId] = rank
            # If caller requested expanded data IDs, we defer that until here
            # so we do as little expansion as possible.
            if expand:
                for ref in bestRefs.values():
                    dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records)
                    yield ref.expanded(dataId)
            else:
                yield from bestRefs.values()

    dimensions: DimensionUniverse
    """The universe of all dimensions known to the registry
    (`DimensionUniverse`).
    """

    storageClasses: StorageClassFactory
    """All storage classes known to the registry (`StorageClassFactory`).
    """