
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ("Registry", "AmbiguousDatasetError", "ConflictingDefinitionError", "OrphanedRecordError") 

25 

26import contextlib 

27import sys 

28from typing import ( 

29 Any, 

30 Iterable, 

31 Iterator, 

32 List, 

33 Mapping, 

34 Optional, 

35 Set, 

36 Type, 

37 TYPE_CHECKING, 

38 Union, 

39) 

40 

41import sqlalchemy 

42 

43from ..core import ( 

44 Config, 

45 DataCoordinate, 

46 DataId, 

47 DatasetRef, 

48 DatasetType, 

49 Dimension, 

50 DimensionElement, 

51 DimensionGraph, 

52 DimensionRecord, 

53 DimensionUniverse, 

54 ExpandedDataCoordinate, 

55 StorageClassFactory, 

56) 

57from ..core import ddl 

58from ..core.utils import doImport, iterable, transactional 

59from ._config import RegistryConfig 

60from .queries import ( 

61 DatasetRegistryStorage, 

62 QueryBuilder, 

63 QuerySummary, 

64) 

65from .tables import makeRegistryTableSpecs 

66from ._collectionType import CollectionType 

67from .wildcards import CollectionQuery, CollectionSearch 

68 

69 if TYPE_CHECKING: 

70 from ..butlerConfig import ButlerConfig 

71 from ..core import ( 

72 Quantum 

73 ) 

74 from .interfaces import ( 

75 CollectionManager, 

76 Database, 

77 OpaqueTableStorageManager, 

78 DimensionRecordStorageManager, 

79 ) 

80 

81 

82class AmbiguousDatasetError(Exception): 

83 """Exception raised when a `DatasetRef` has no ID and a `Registry` 

84 operation requires one. 

85 """ 

86 

87 

88class ConflictingDefinitionError(Exception): 

89 """Exception raised when trying to insert a database record when a 

90 conflicting record already exists. 

91 """ 

92 

93 

94class OrphanedRecordError(Exception): 

95 """Exception raised when trying to remove or modify a database record 

96 that is still being used in some other table. 

97 """ 

98 

99 

100def _checkAndGetId(ref: DatasetRef) -> int: 

101 """Return the ID of the given `DatasetRef`, or raise if it is `None`. 

102 

103 This trivial function exists to allow operations that would otherwise be 

104 natural list comprehensions to check that the ID is not `None` as well. 

105 

106 Parameters 

107 ---------- 

108 ref : `DatasetRef` 

109 Dataset reference. 

110 

111 Returns 

112 ------- 

113 id : `int` 

114 ``ref.id`` 

115 

116 Raises 

117 ------ 

118 AmbiguousDatasetError 

119 Raised if ``ref.id`` is `None`. 

120 """ 

121 if ref.id is None: 

122 raise AmbiguousDatasetError("Dataset ID must not be `None`.") 

123 return ref.id 

124 

125 

126class Registry: 

127 """Registry interface. 

128 

129 Parameters 

130 ---------- 

131 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

132 Registry configuration 

133 """ 

134 

135 defaultConfigFile = None 

136 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

137 absolute path. Can be None if no defaults specified. 

138 """ 

139 

140 @classmethod 

141 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

142 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

143 """Create `Registry` subclass instance from `config`. 

144 

145 Uses ``registry.cls`` from `config` to determine which subclass to 

146 instantiate. 

147 

148 Parameters 

149 ---------- 

150 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

151 Registry configuration 

152 create : `bool`, optional 

153 Assume empty Registry and create a new one. 

154 butlerRoot : `str`, optional 

155 Path to the repository root this `Registry` will manage. 

156 writeable : `bool`, optional 

157 If `True` (default) create a read-write connection to the database. 

158 

159 Returns 

160 ------- 

161 registry : `Registry` (subclass) 

162 A new `Registry` subclass instance. 

163 """ 

164 if not isinstance(config, RegistryConfig): 

165 if isinstance(config, str) or isinstance(config, Config): 

166 config = RegistryConfig(config) 

167 else: 

168 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

169 config.replaceRoot(butlerRoot) 

170 DatabaseClass = config.getDatabaseClass() 

171 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

172 namespace=config.get("namespace"), writeable=writeable) 

173 universe = DimensionUniverse(config) 

174 opaque = doImport(config["managers", "opaque"]) 

175 dimensions = doImport(config["managers", "dimensions"]) 

176 collections = doImport(config["managers", "collections"]) 

177 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

178 create=create) 

179 
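# Illustrative usage (hedged sketch, not part of this module): constructing a
# read-only Registry from an existing repository's registry configuration.
# The path shown is hypothetical; `RegistryConfig` is imported above.
#
#     config = RegistryConfig("/path/to/repo/butler.yaml")
#     registry = Registry.fromConfig(config, writeable=False)
#     assert not registry.isWriteable()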

180 def __init__(self, database: Database, universe: DimensionUniverse, *, 

181 opaque: Type[OpaqueTableStorageManager], 

182 dimensions: Type[DimensionRecordStorageManager], 

183 collections: Type[CollectionManager], 

184 create: bool = False): 

185 self._db = database 

186 self.storageClasses = StorageClassFactory() 

187 with self._db.declareStaticTables(create=create) as context: 

188 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

189 self._collections = collections.initialize(self._db, context) 

190 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, self._collections)) 

191 self._opaque = opaque.initialize(self._db, context) 

192 self._collections.refresh() 

193 # TODO: we shouldn't be grabbing the private connection from the 

194 # Database instance like this, but it's a reasonable way to proceed 

195 # while we transition to using the Database API more. 

196 self._connection = self._db._connection 

197 self._datasetStorage = DatasetRegistryStorage(connection=self._connection, 

198 universe=self.dimensions, 

199 tables=self._tables._asdict(), 

200 collections=self._collections) 

201 self._datasetTypes = {} 

202 

203 def __str__(self) -> str: 

204 return str(self._db) 

205 

206 def __repr__(self) -> str: 

207 return f"Registry({self._db!r}, {self.dimensions!r})" 

208 

209 def isWriteable(self) -> bool: 

210 """Return `True` if this registry allows write operations, and `False` 

211 otherwise. 

212 """ 

213 return self._db.isWriteable() 

214 

215 @property 

216 def dimensions(self) -> DimensionUniverse: 

217 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

218 """ 

219 return self._dimensions.universe 

220 

221 @contextlib.contextmanager 

222 def transaction(self): 

223 """Return a context manager that represents a transaction. 

224 """ 

225 # TODO make savepoint=False the default. 

226 try: 

227 with self._db.transaction(): 

228 yield 

229 except BaseException: 

230 # TODO: this clears the caches sometimes when we wouldn't actually 

231 # need to. Can we avoid that? 

232 self._dimensions.clearCaches() 

233 self._datasetTypes.clear() 

234 raise 

235 
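# Illustrative sketch: grouping writes so they commit or roll back together.
# `registry` is assumed to exist; the dimension record shown is hypothetical
# and depends on the configured dimension universe.
#
#     with registry.transaction():
#         registry.insertDimensionData("instrument", {"name": "DummyCam"})
#         # Any exception raised before the block exits rolls back the insert.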

236 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec): 

237 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

238 other data repository client. 

239 

240 Opaque table records can be added via `insertOpaqueData`, retrieved via 

241 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

242 

243 Parameters 

244 ---------- 

245 tableName : `str` 

246 Logical name of the opaque table. This may differ from the 

247 actual name used in the database by a prefix and/or suffix. 

248 spec : `ddl.TableSpec` 

249 Specification for the table to be added. 

250 """ 

251 self._opaque.register(tableName, spec) 

252 

253 @transactional 

254 def insertOpaqueData(self, tableName: str, *data: dict): 

255 """Insert records into an opaque table. 

256 

257 Parameters 

258 ---------- 

259 tableName : `str` 

260 Logical name of the opaque table. Must match the name used in a 

261 previous call to `registerOpaqueTable`. 

262 data 

263 Each additional positional argument is a dictionary that represents 

264 a single row to be added. 

265 """ 

266 self._opaque[tableName].insert(*data) 

267 

268 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

269 """Retrieve records from an opaque table. 

270 

271 Parameters 

272 ---------- 

273 tableName : `str` 

274 Logical name of the opaque table. Must match the name used in a 

275 previous call to `registerOpaqueTable`. 

276 where 

277 Additional keyword arguments are interpreted as equality 

278 constraints that restrict the returned rows (combined with AND); 

279 keyword arguments are column names and values are the values they 

280 must have. 

281 

282 Yields 

283 ------ 

284 row : `dict` 

285 A dictionary representing a single result row. 

286 """ 

287 yield from self._opaque[tableName].fetch(**where) 

288 

289 @transactional 

290 def deleteOpaqueData(self, tableName: str, **where: Any): 

291 """Remove records from an opaque table. 

292 

293 Parameters 

294 ---------- 

295 tableName : `str` 

296 Logical name of the opaque table. Must match the name used in a 

297 previous call to `registerOpaqueTable`. 

298 where 

299 Additional keyword arguments are interpreted as equality 

300 constraints that restrict the deleted rows (combined with AND); 

301 keyword arguments are column names and values are the values they 

302 must have. 

303 """ 

304 self._opaque[tableName].delete(**where) 

305 
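# Illustrative sketch: round-tripping rows through an opaque table. The table
# name is hypothetical, and the exact `ddl.TableSpec`/`ddl.FieldSpec` arguments
# shown are assumptions about the `ddl` module imported above.
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("my_datastore_records", spec)
#     registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)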

306 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED): 

307 """Add a new collection if one with the given name does not exist. 

308 

309 Parameters 

310 ---------- 

311 name : `str` 

312 The name of the collection to create. 

313 type : `CollectionType` 

314 Enum value indicating the type of collection to create. 

315 

316 Notes 

317 ----- 

318 This method cannot be called within transactions, as it needs to be 

319 able to perform its own transaction to be concurrent. 

320 """ 

321 self._collections.register(name, type) 

322 

323 def getCollectionType(self, name: str) -> CollectionType: 

324 """Return an enumeration value indicating the type of the given 

325 collection. 

326 

327 Parameters 

328 ---------- 

329 name : `str` 

330 The name of the collection. 

331 

332 Returns 

333 ------- 

334 type : `CollectionType` 

335 Enum value indicating the type of this collection. 

336 

337 Raises 

338 ------ 

339 MissingCollectionError 

340 Raised if no collection with the given name exists. 

341 """ 

342 return self._collections.find(name).type 

343 

344 def registerRun(self, name: str): 

345 """Add a new run if one with the given name does not exist. 

346 

347 Parameters 

348 ---------- 

349 name : `str` 

350 The name of the run to create. 

351 

352 Notes 

353 ----- 

354 This method cannot be called within transactions, as it needs to be 

355 able to perform its own transaction to be concurrent. 

356 """ 

357 self._collections.register(name, CollectionType.RUN) 

358 

359 def getCollectionChain(self, parent: str) -> CollectionSearch: 

360 """Return the child collections in a `~CollectionType.CHAINED` 

361 collection. 

362 

363 Parameters 

364 ---------- 

365 parent : `str` 

366 Name of the chained collection. Must have already been added via 

367 a call to `Registry.registerCollection`. 

368 

369 Returns 

370 ------- 

371 children : `CollectionSearch` 

372 An object that defines the search path of the collection. 

373 See :ref:`daf_butler_collection_expressions` for more information. 

374 

375 Raises 

376 ------ 

377 MissingCollectionError 

378 Raised if ``parent`` does not exist in the `Registry`. 

379 TypeError 

380 Raised if ``parent`` does not correspond to a 

381 `~CollectionType.CHAINED` collection. 

382 """ 

383 record = self._collections.find(parent) 

384 if record.type is not CollectionType.CHAINED: 

385 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

386 return record.children 

387 

388 def setCollectionChain(self, parent: str, children: Any): 

389 """Define or redefine a `~CollectionType.CHAINED` collection. 

390 

391 Parameters 

392 ---------- 

393 parent : `str` 

394 Name of the chained collection. Must have already been added via 

395 a call to `Registry.registerCollection`. 

396 children : `Any` 

397 An expression defining an ordered search of child collections, 

398 generally an iterable of `str`. Restrictions on the dataset types 

399 to be searched can also be included, by passing mapping or an 

400 iterable containing tuples; see 

401 :ref:`daf_butler_collection_expressions` for more information. 

402 

403 Raises 

404 ------ 

405 MissingCollectionError 

406 Raised when any of the given collections do not exist in the 

407 `Registry`. 

408 TypeError 

409 Raised if ``parent`` does not correspond to a 

410 `~CollectionType.CHAINED` collection. 

411 ValueError 

412 Raised if the given collections contains a cycle. 

413 """ 

414 record = self._collections.find(parent) 

415 if record.type is not CollectionType.CHAINED: 

416 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

417 children = CollectionSearch.fromExpression(children) 

418 if children != record.children: 

419 record.update(self._collections, children) 

420 
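# Illustrative sketch: registering collections and defining a CHAINED
# collection that searches a run and a tagged collection in order. All
# collection names are hypothetical.
#
#     registry.registerRun("my_run")
#     registry.registerCollection("my_tagged", CollectionType.TAGGED)
#     registry.registerCollection("my_chain", CollectionType.CHAINED)
#     registry.setCollectionChain("my_chain", ["my_run", "my_tagged"])
#     print(registry.getCollectionChain("my_chain"))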

421 @transactional 

422 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

423 """ 

424 Add a new `DatasetType` to the Registry. 

425 

426 It is not an error to register the same `DatasetType` twice. 

427 

428 Parameters 

429 ---------- 

430 datasetType : `DatasetType` 

431 The `DatasetType` to be added. 

432 

433 Returns 

434 ------- 

435 inserted : `bool` 

436 `True` if ``datasetType`` was inserted, `False` if an identical 

437 existing `DatsetType` was found. Note that in either case the 

438 DatasetType is guaranteed to be defined in the Registry 

439 consistently with the given definition. 

440 

441 Raises 

442 ------ 

443 ValueError 

444 Raised if the dimensions or storage class are invalid. 

445 ConflictingDefinitionError 

446 Raised if this DatasetType is already registered with a different 

447 definition. 

448 """ 

449 # TODO: this implementation isn't concurrent, except *maybe* in SQLite 

450 # with aggressive locking (where starting a transaction is essentially 

451 # the same as grabbing a full-database lock). Should be reimplemented 

452 # with Database.sync to fix this, but that may require schema changes 

453 # as well so we only have to synchronize one row to know if we have 

454 # inconsistent definitions. 

455 

456 # If the DatasetType is already in the cache, we assume it's already in 

457 # the DB (note that we don't actually provide a way to remove them from 

458 # the DB). 

459 existingDatasetType = self._datasetTypes.get(datasetType.name) 

460 # If it's not in the cache, try to insert it. 

461 if existingDatasetType is None: 

462 try: 

463 with self._db.transaction(): 

464 self._db.insert( 

465 self._tables.dataset_type, 

466 { 

467 "dataset_type_name": datasetType.name, 

468 "storage_class": datasetType.storageClass.name, 

469 } 

470 ) 

471 except sqlalchemy.exc.IntegrityError: 

472 # Insert failed on the only unique constraint on this table: 

473 # dataset_type_name. So now the question is whether the one in 

474 # there is the same as the one we tried to insert. 

475 existingDatasetType = self.getDatasetType(datasetType.name) 

476 else: 

477 # If adding the DatasetType record itself succeeded, add its 

478 # dimensions (if any). We don't guard this in a try block 

479 # because a problem with this insert means the database 

480 # content must be corrupted. 

481 if datasetType.dimensions: 

482 self._db.insert( 

483 self._tables.dataset_type_dimensions, 

484 *[{"dataset_type_name": datasetType.name, 

485 "dimension_name": dimensionName} 

486 for dimensionName in datasetType.dimensions.names] 

487 ) 

488 # Update the cache. 

489 self._datasetTypes[datasetType.name] = datasetType 

490 # Also register component DatasetTypes (if any). 

491 for compName, compStorageClass in datasetType.storageClass.components.items(): 

492 compType = DatasetType(datasetType.componentTypeName(compName), 

493 dimensions=datasetType.dimensions, 

494 storageClass=compStorageClass) 

495 self.registerDatasetType(compType) 

496 # Inserts succeeded, nothing left to do here. 

497 return True 

498 # A DatasetType with this name exists; check whether it is equal. 

499 if datasetType == existingDatasetType: 

500 return False 

501 else: 

502 raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}") 

503 
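# Illustrative sketch: defining and registering a dataset type. The name,
# dimensions, and storage class are hypothetical and must exist in the
# configured dimension universe and storage class factory; passing the storage
# class by name is assumed to be supported by `DatasetType`.
#
#     datasetType = DatasetType(
#         "calexp",
#         dimensions=registry.dimensions.extract(["instrument", "visit", "detector"]),
#         storageClass="ExposureF",
#     )
#     registry.registerDatasetType(datasetType)  # True if newly inserted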

504 def getDatasetType(self, name: str) -> DatasetType: 

505 """Get the `DatasetType`. 

506 

507 Parameters 

508 ---------- 

509 name : `str` 

510 Name of the type. 

511 

512 Returns 

513 ------- 

514 type : `DatasetType` 

515 The `DatasetType` associated with the given name. 

516 

517 Raises 

518 ------ 

519 KeyError 

520 Raised if the named DatasetType could not be found in the registry. 

521 """ 

522 datasetType = self._datasetTypes.get(name) 

523 if datasetType is None: 

524 # Get StorageClass from DatasetType table 

525 result = self._db.query( 

526 sqlalchemy.sql.select( 

527 [self._tables.dataset_type.c.storage_class] 

528 ).where( 

529 self._tables.dataset_type.columns.dataset_type_name == name 

530 ) 

531 ).fetchone() 

532 

533 if result is None: 

534 raise KeyError("Could not find entry for datasetType {}".format(name)) 

535 

536 storageClass = self.storageClasses.getStorageClass(result["storage_class"]) 

537 # Get Dimensions (if any) from DatasetTypeDimensions table 

538 result = self._db.query( 

539 sqlalchemy.sql.select( 

540 [self._tables.dataset_type_dimensions.columns.dimension_name] 

541 ).where( 

542 self._tables.dataset_type_dimensions.columns.dataset_type_name == name 

543 ) 

544 ).fetchall() 

545 dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ()) 

546 datasetType = DatasetType(name=name, 

547 storageClass=storageClass, 

548 dimensions=dimensions) 

549 self._datasetTypes[name] = datasetType 

550 return datasetType 

551 

552 def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy, 

553 datasetType: Optional[DatasetType] = None, 

554 dataId: Optional[DataCoordinate] = None): 

555 """Construct a DatasetRef from the result of a query on the Dataset 

556 table. 

557 

558 Parameters 

559 ---------- 

560 row : `sqlalchemy.engine.RowProxy`. 

561 Row of a query that contains all columns from the `Dataset` table. 

562 May include additional fields (which will be ignored). 

563 datasetType : `DatasetType`, optional 

564 `DatasetType` associated with this dataset. Will be retrieved 

565 if not provided. If provided, the caller guarantees that it is 

566 already consistent with what would have been retrieved from the 

567 database. 

568 dataId : `DataCoordinate`, optional 

569 Dimensions associated with this dataset. Will be retrieved if not 

570 provided. If provided, the caller guarantees that it is already 

571 consistent with what would have been retrieved from the database. 

572 

573 Returns 

574 ------- 

575 ref : `DatasetRef`. 

576 A new `DatasetRef` instance. 

577 """ 

578 if datasetType is None: 

579 datasetType = self.getDatasetType(row["dataset_type_name"]) 

580 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

581 assert runRecord is not None, "Should be guaranteed by foreign key constraints." 

582 run = runRecord.name 

583 datasetRefHash = row["dataset_ref_hash"] 

584 if dataId is None: 

585 # TODO: should we expand here? 

586 dataId = DataCoordinate.standardize( 

587 row, 

588 graph=datasetType.dimensions, 

589 universe=self.dimensions 

590 ) 

591 # Get components (if present) 

592 components = {} 

593 if datasetType.storageClass.isComposite(): 

594 t = self._tables 

595 columns = list(t.dataset.columns) 

596 columns.append(t.dataset_composition.columns.component_name) 

597 results = self._db.query( 

598 sqlalchemy.sql.select( 

599 columns 

600 ).select_from( 

601 t.dataset.join( 

602 t.dataset_composition, 

603 (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id) 

604 ) 

605 ).where( 

606 t.dataset_composition.columns.parent_dataset_id == row["dataset_id"] 

607 ) 

608 ).fetchall() 

609 for result in results: 

610 componentName = result["component_name"] 

611 componentDatasetType = DatasetType( 

612 DatasetType.nameWithComponent(datasetType.name, componentName), 

613 dimensions=datasetType.dimensions, 

614 storageClass=datasetType.storageClass.components[componentName] 

615 ) 

616 components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId, 

617 datasetType=componentDatasetType) 

618 if not components.keys() <= datasetType.storageClass.components.keys(): 

619 raise RuntimeError( 

620 f"Inconsistency detected between dataset and storage class definitions: " 

621 f"{datasetType.storageClass.name} has components " 

622 f"{set(datasetType.storageClass.components.keys())}, " 

623 f"but dataset has components {set(components.keys())}" 

624 ) 

625 return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run, 

626 hash=datasetRefHash, components=components) 

627 

628 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

629 collections: Any, **kwds: Any) -> Optional[DatasetRef]: 

630 """Find a dataset given its `DatasetType` and data ID. 

631 

632 This can be used to obtain a `DatasetRef` that permits the dataset to 

633 be read from a `Datastore`. 

634 

635 Parameters 

636 ---------- 

637 datasetType : `DatasetType` or `str` 

638 A `DatasetType` or the name of one. 

639 dataId : `dict` or `DataCoordinate`, optional 

640 A `dict`-like object containing the `Dimension` links that identify 

641 the dataset within a collection. 

642 collections 

643 An expression that fully or partially identifies the collections 

644 to search for the dataset, such as a `str`, `re.Pattern`, or 

645 iterable thereof. `...` can be used to return all collections. 

646 See :ref:`daf_butler_collection_expressions` for more information. 

647 **kwds 

648 Additional keyword arguments passed to 

649 `DataCoordinate.standardize` to convert ``dataId`` to a true 

650 `DataCoordinate` or augment an existing one. 

651 

652 Returns 

653 ------- 

654 ref : `DatasetRef` 

655 A reference to the dataset, or `None` if no matching Dataset 

656 was found. 

657 

658 Raises 

659 ------ 

660 LookupError 

661 Raised if one or more data ID keys are missing. 

662 MissingCollectionError 

663 Raised if any of ``collections`` does not exist in the registry. 

664 """ 

665 if not isinstance(datasetType, DatasetType): 

666 datasetType = self.getDatasetType(datasetType) 

667 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

668 universe=self.dimensions, **kwds) 

669 collections = CollectionSearch.fromExpression(collections) 

670 for collectionRecord in collections.iter(self._collections, datasetType=datasetType): 

671 if collectionRecord.type is CollectionType.TAGGED: 

672 collectionColumn = \ 

673 self._tables.dataset_collection.columns[self._collections.getCollectionForeignKeyName()] 

674 fromClause = self._tables.dataset.join(self._tables.dataset_collection) 

675 elif collectionRecord.type is CollectionType.RUN: 

676 collectionColumn = self._tables.dataset.columns[self._collections.getRunForeignKeyName()] 

677 fromClause = self._tables.dataset 

678 else: 

679 raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.") 

680 whereTerms = [ 

681 self._tables.dataset.columns.dataset_type_name == datasetType.name, 

682 collectionColumn == collectionRecord.key, 

683 ] 

684 whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys()) 

685 query = self._tables.dataset.select().select_from( 

686 fromClause 

687 ).where( 

688 sqlalchemy.sql.and_(*whereTerms) 

689 ) 

690 result = self._db.query(query).fetchone() 

691 if result is not None: 

692 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId) 

693 return None 

694 
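# Illustrative sketch: looking up a single dataset by dataset type and data ID
# in an ordered list of collections. All names and values are hypothetical.
#
#     ref = registry.findDataset("calexp", instrument="DummyCam", visit=42,
#                                detector=1, collections=["my_run"])
#     if ref is not None:
#         print(ref.id, ref.run)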

695 @transactional 

696 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

697 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False 

698 ) -> List[DatasetRef]: 

699 """Insert one or more datasets into the `Registry` 

700 

701 This always adds new datasets; to associate existing datasets with 

702 a new collection, use ``associate``. 

703 

704 Parameters 

705 ---------- 

706 datasetType : `DatasetType` or `str` 

707 A `DatasetType` or the name of one. 

708 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

709 Dimension-based identifiers for the new datasets. 

710 run : `str` 

711 The name of the run that produced the datasets. 

712 producer : `Quantum`, optional 

713 Unit of work that produced the datasets. May be `None` to store 

714 no provenance information, but if present the `Quantum` must 

715 already have been added to the Registry. 

716 recursive : `bool`, optional 

717 If `True`, recursively add datasets and attach entries for component 

718 datasets as well. 

719 

720 Returns 

721 ------- 

722 refs : `list` of `DatasetRef` 

723 Resolved `DatasetRef` instances for all given data IDs (in the same 

724 order). 

725 

726 Raises 

727 ------ 

728 ConflictingDefinitionError 

729 If a dataset with the same dataset type and data ID as one of those 

730 given already exists in the given collection. 

731 MissingCollectionError 

732 Raised if ``run`` does not exist in the registry. 

733 """ 

734 if not isinstance(datasetType, DatasetType): 

735 datasetType = self.getDatasetType(datasetType) 

736 rows = [] 

737 refs = [] 

738 runRecord = self._collections.find(run) 

739 base = { 

740 "dataset_type_name": datasetType.name, 

741 self._collections.getRunForeignKeyName(): runRecord.key, 

742 "quantum_id": producer.id if producer is not None else None, 

743 } 

744 # Expand data IDs and build both a list of unresolved DatasetRefs 

745 # and a list of dictionary rows for the dataset table. 

746 for dataId in dataIds: 

747 ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions)) 

748 refs.append(ref) 

749 row = dict(base, dataset_ref_hash=ref.hash) 

750 for dimension, value in ref.dataId.full.items(): 

751 row[dimension.name] = value 

752 rows.append(row) 

753 # Actually insert into the dataset table. 

754 try: 

755 datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True) 

756 except sqlalchemy.exc.IntegrityError as err: 

757 raise ConflictingDefinitionError( 

758 f"Constraint violation while inserting datasets into run {run}. " 

759 f"This usually means that one or more datasets with the same dataset type and data ID " 

760 f"already exist in the collection, but it may be a foreign key violation." 

761 ) from err 

762 # Resolve the DatasetRefs with the autoincrement IDs we generated. 

763 refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)] 

764 if recursive and datasetType.isComposite(): 

765 # Insert component rows by recursing, and gather a single big list 

766 # of rows to insert into the dataset_composition table. 

767 compositionRows = [] 

768 for componentName in datasetType.storageClass.components: 

769 componentDatasetType = datasetType.makeComponentDatasetType(componentName) 

770 componentRefs = self.insertDatasets(componentDatasetType, 

771 dataIds=(ref.dataId for ref in refs), 

772 run=run, 

773 producer=producer, 

774 recursive=True) 

775 for parentRef, componentRef in zip(refs, componentRefs): 

776 parentRef._components[componentName] = componentRef 

777 compositionRows.append({ 

778 "parent_dataset_id": parentRef.id, 

779 "component_dataset_id": componentRef.id, 

780 "component_name": componentName, 

781 }) 

782 if compositionRows: 

783 self._db.insert(self._tables.dataset_composition, *compositionRows) 

784 return refs 

785 
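# Illustrative sketch: inserting new datasets into a previously registered run.
# The dataset type, data ID keys, and run name are hypothetical.
#
#     refs = registry.insertDatasets(
#         "calexp",
#         dataIds=[{"instrument": "DummyCam", "visit": 42, "detector": 1}],
#         run="my_run",
#     )
#     print(refs[0].id)  # autoincrement ID assigned by the database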

786 def getDataset(self, id: int, datasetType: Optional[DatasetType] = None, 

787 dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]: 

788 """Retrieve a Dataset entry. 

789 

790 Parameters 

791 ---------- 

792 id : `int` 

793 The unique identifier for the Dataset. 

794 datasetType : `DatasetType`, optional 

795 The `DatasetType` of the dataset to retrieve. This is used to 

796 short-circuit retrieving the `DatasetType`, so if provided, the 

797 caller is guaranteeing that it is what would have been retrieved. 

798 dataId : `DataCoordinate`, optional 

799 A `Dimension`-based identifier for the dataset within a 

800 collection, possibly containing additional metadata. This is used 

801 to short-circuit retrieving the dataId, so if provided, the 

802 caller is guaranteeing that it is what would have been retrieved. 

803 

804 Returns 

805 ------- 

806 ref : `DatasetRef` 

807 A ref to the Dataset, or `None` if no matching Dataset 

808 was found. 

809 """ 

810 result = self._db.query( 

811 self._tables.dataset.select().where( 

812 self._tables.dataset.columns.dataset_id == id 

813 ) 

814 ).fetchone() 

815 if result is None: 

816 return None 

817 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId) 

818 

819 @transactional 

820 def removeDataset(self, ref: DatasetRef): 

821 """Remove a dataset from the Registry. 

822 

823 The dataset and all components will be removed unconditionally from 

824 all collections, and any associated `Quantum` records will also be 

825 removed. `Datastore` records will *not* be deleted; the caller is 

826 responsible for ensuring that the dataset has already been removed 

827 from all Datastores. 

828 

829 Parameters 

830 ---------- 

831 ref : `DatasetRef` 

832 Reference to the dataset to be removed. Must include a valid 

833 ``id`` attribute, and should be considered invalidated upon return. 

834 

835 Raises 

836 ------ 

837 AmbiguousDatasetError 

838 Raised if ``ref.id`` is `None`. 

839 OrphanedRecordError 

840 Raised if the dataset is still present in any `Datastore`. 

841 """ 

842 if not ref.id: 

843 raise AmbiguousDatasetError(f"Cannot remove dataset {ref} without ID.") 

844 # Remove component datasets. We assume ``ref.components`` is already 

845 # correctly populated, and rely on ON DELETE CASCADE to remove entries 

846 # from DatasetComposition. 

847 for componentRef in ref.components.values(): 

848 self.removeDataset(componentRef) 

849 

850 # Remove related quanta. We rely on ON DELETE CASCADE to remove any 

851 # related records in dataset_consumers. Note that we permit a Quantum 

852 # to be deleted without removing the datasets it refers to, but do not 

853 # allow a dataset to be deleted without removing the Quanta that refer 

854 # to them. A dataset is still quite usable without provenance, but 

855 # provenance is worthless if it's inaccurate. 

856 t = self._tables 

857 selectProducer = sqlalchemy.sql.select( 

858 [t.dataset.columns.quantum_id] 

859 ).where( 

860 t.dataset.columns.dataset_id == ref.id 

861 ) 

862 selectConsumers = sqlalchemy.sql.select( 

863 [t.dataset_consumers.columns.quantum_id] 

864 ).where( 

865 t.dataset_consumers.columns.dataset_id == ref.id 

866 ) 

867 # TODO: we'd like to use Database.delete here, but it doesn't support 

868 # general queries yet. 

869 self._connection.execute( 

870 t.quantum.delete().where( 

871 t.quantum.columns.id.in_(sqlalchemy.sql.union(selectProducer, selectConsumers)) 

872 ) 

873 ) 

874 # Remove the Dataset record itself. We rely on ON DELETE CASCADE to 

875 # remove from DatasetCollection, and assume foreign key violations 

876 # come from DatasetLocation (everything else should have an ON DELETE). 

877 try: 

878 self._connection.execute( 

879 t.dataset.delete().where(t.dataset.c.dataset_id == ref.id) 

880 ) 

881 except sqlalchemy.exc.IntegrityError as err: 

882 raise OrphanedRecordError(f"Dataset {ref} is still present in one or more Datastores.") from err 

883 

884 @transactional 

885 def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef): 

886 """Attach a component to a dataset. 

887 

888 Parameters 

889 ---------- 

890 name : `str` 

891 Name of the component. 

892 parent : `DatasetRef` 

893 A reference to the parent dataset. Will be updated to reference 

894 the component. 

895 component : `DatasetRef` 

896 A reference to the component dataset. 

897 

898 Raises 

899 ------ 

900 AmbiguousDatasetError 

901 Raised if ``parent.id`` or ``component.id`` is `None`. 

902 """ 

903 # TODO Insert check for component name and type against 

904 # parent.storageClass specified components 

905 if parent.id is None: 

906 raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.") 

907 if component.id is None: 

908 raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.") 

909 values = dict(component_name=name, 

910 parent_dataset_id=parent.id, 

911 component_dataset_id=component.id) 

912 self._db.insert(self._tables.dataset_composition, values) 

913 parent._components[name] = component 

914 

915 @transactional 

916 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

917 """Add existing Datasets to a collection, implicitly creating the 

918 collection if it does not already exist. 

919 

920 If a DatasetRef with the same exact ``dataset_id`` is already in a 

921 collection nothing is changed. If a `DatasetRef` with the same 

922 `DatasetType` and dimension values but with different ``dataset_id`` 

923 exists in the collection, `ValueError` is raised. 

924 

925 Parameters 

926 ---------- 

927 collection : `str` 

928 Indicates the collection the Datasets should be associated with. 

929 refs : iterable of `DatasetRef` 

930 An iterable of resolved `DatasetRef` instances that already exist 

931 in this `Registry`. 

932 recursive : `bool`, optional 

933 If `True`, associate all component datasets as well. Note that 

934 this only associates components that are actually included in the 

935 given `DatasetRef` instances, which may not be the same as those in 

936 the database (especially if they were obtained from 

937 `queryDatasets`, which does not populate `DatasetRef.components`). 

938 

939 Raises 

940 ------ 

941 ConflictingDefinitionError 

942 If a Dataset with the given `DatasetRef` already exists in the 

943 given collection. 

944 AmbiguousDatasetError 

945 Raised if ``any(ref.id is None for ref in refs)``. 

946 MissingCollectionError 

947 Raised if ``collection`` does not exist in the registry. 

948 TypeError 

949 Raised if adding new datasets to the given ``collection`` is not 

950 allowed. 

951 """ 

952 collectionRecord = self._collections.find(collection) 

953 if collectionRecord.type is not CollectionType.TAGGED: 

954 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

955 if recursive: 

956 refs = DatasetRef.flatten(refs) 

957 rows = [{"dataset_id": _checkAndGetId(ref), 

958 "dataset_ref_hash": ref.hash, 

959 self._collections.getCollectionForeignKeyName(): collectionRecord.key} 

960 for ref in refs] 

961 try: 

962 self._db.replace(self._tables.dataset_collection, *rows) 

963 except sqlalchemy.exc.IntegrityError as err: 

964 raise ConflictingDefinitionError( 

965 f"Constraint violation while associating datasets with collection {collection}. " 

966 f"This probably means that one or more datasets with the same dataset type and data ID " 

967 f"already exist in the collection, but it may also indicate that the datasets do not exist." 

968 ) from err 

969 

970 @transactional 

971 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

972 """Remove existing Datasets from a collection. 

973 

974 ``collection`` and ``ref`` combinations that are not currently 

975 associated are silently ignored. 

976 

977 Parameters 

978 ---------- 

979 collection : `str` 

980 The collection the Datasets should no longer be associated with. 

981 refs : iterable of `DatasetRef` 

982 An iterable of resolved `DatasetRef` instances that already exist 

983 in this `Registry`. 

984 recursive : `bool`, optional 

985 If `True`, disassociate all component datasets as well. Note that 

986 this only disassociates components that are actually included in 

987 the given `DatasetRef` instances, which may not be the same as 

988 those in the database (especially if they were obtained from 

989 `queryDatasets`, which does not populate `DatasetRef.components`). 

990 

991 Raises 

992 ------ 

993 AmbiguousDatasetError 

994 Raised if ``any(ref.id is None for ref in refs)``. 

995 MissingCollectionError 

996 Raised if ``collection`` does not exist in the registry. 

997 TypeError 

998 Raised if removing datasets from the given ``collection`` is not 

999 allowed. 

1000 """ 

1001 collectionFieldName = self._collections.getCollectionForeignKeyName() 

1002 collectionRecord = self._collections.find(collection) 

1003 if collectionRecord.type is not CollectionType.TAGGED: 

1004 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

1005 "expected TAGGED.") 

1006 if recursive: 

1007 refs = DatasetRef.flatten(refs) 

1008 rows = [{"dataset_id": _checkAndGetId(ref), collectionFieldName: collectionRecord.key} 

1009 for ref in refs] 

1010 self._db.delete(self._tables.dataset_collection, ["dataset_id", collectionFieldName], *rows) 

1011 
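# Illustrative sketch: tagging resolved datasets into a TAGGED collection and
# removing them again. `refs` is assumed to hold resolved `DatasetRef` objects
# (e.g. from `insertDatasets`); the collection name is hypothetical.
#
#     registry.registerCollection("good-seeing", CollectionType.TAGGED)
#     registry.associate("good-seeing", refs)
#     registry.disassociate("good-seeing", refs)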

1012 @transactional 

1013 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

1014 """Record that a datastore holds the given datasets. 

1015 

1016 Typically used by `Datastore`. 

1017 

1018 Parameters 

1019 ---------- 

1020 datastoreName : `str` 

1021 Name of the datastore holding these datasets. 

1022 refs : `~collections.abc.Iterable` of `DatasetRef` 

1023 References to the datasets. 

1024 

1025 Raises 

1026 ------ 

1027 AmbiguousDatasetError 

1028 Raised if ``any(ref.id is None for ref in refs)``. 

1029 """ 

1030 self._db.insert( 

1031 self._tables.dataset_storage, 

1032 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs] 

1033 ) 

1034 

1035 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]: 

1036 """Retrieve datastore locations for a given dataset. 

1037 

1038 Typically used by `Datastore`. 

1039 

1040 Parameters 

1041 ---------- 

1042 ref : `DatasetRef` 

1043 A reference to the dataset for which to retrieve storage 

1044 information. 

1045 

1046 Returns 

1047 ------- 

1048 datastores : `set` of `str` 

1049 All the matching datastores holding this dataset. Empty set 

1050 if the dataset does not exist anywhere. 

1051 

1052 Raises 

1053 ------ 

1054 AmbiguousDatasetError 

1055 Raised if ``ref.id`` is `None`. 

1056 """ 

1057 table = self._tables.dataset_storage 

1058 result = self._db.query( 

1059 sqlalchemy.sql.select( 

1060 [table.columns.datastore_name] 

1061 ).where( 

1062 table.columns.dataset_id == ref.id 

1063 ) 

1064 ).fetchall() 

1065 return {r["datastore_name"] for r in result} 

1066 

1067 @transactional 

1068 def removeDatasetLocation(self, datastoreName: str, ref: DatasetRef): 

1069 """Remove datastore location associated with this dataset. 

1070 

1071 Typically used by `Datastore` when a dataset is removed. 

1072 

1073 Parameters 

1074 ---------- 

1075 datastoreName : `str` 

1076 Name of this `Datastore`. 

1077 ref : `DatasetRef` 

1078 A reference to the dataset for which information is to be removed. 

1079 

1080 Raises 

1081 ------ 

1082 AmbiguousDatasetError 

1083 Raised if ``ref.id`` is `None`. 

1084 """ 

1085 self._db.delete( 

1086 self._tables.dataset_storage, 

1087 ["dataset_id", "datastore_name"], 

1088 {"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName} 

1089 ) 

1090 
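# Illustrative sketch: recording and querying which datastores hold a dataset.
# These calls are normally made by `Datastore` itself; the datastore name is
# hypothetical and `ref` is assumed to be a resolved `DatasetRef`.
#
#     registry.insertDatasetLocations("PosixDatastore", [ref])
#     print(registry.getDatasetLocations(ref))  # e.g. {"PosixDatastore"}
#     registry.removeDatasetLocation("PosixDatastore", ref)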

1091 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1092 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds): 

1093 """Expand a dimension-based data ID to include additional information. 

1094 

1095 Parameters 

1096 ---------- 

1097 dataId : `DataCoordinate` or `dict`, optional 

1098 Data ID to be expanded; augmented and overridden by ``kwds``. 

1099 graph : `DimensionGraph`, optional 

1100 Set of dimensions for the expanded ID. If `None`, the dimensions 

1101 will be inferred from the keys of ``dataId`` and ``kwds``. 

1102 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

1103 are silently ignored, providing a way to extract and expand a 

1104 subset of a data ID. 

1105 records : mapping [`DimensionElement`, `DimensionRecord`], optional 

1106 Dimension record data to use before querying the database for that 

1107 data. 

1108 **kwds 

1109 Additional keywords are treated like additional key-value pairs for 

1110 ``dataId``, extending and overriding 

1111 

1112 Returns 

1113 ------- 

1114 expanded : `ExpandedDataCoordinate` 

1115 A data ID that includes full metadata for all of the dimensions it 

1116 identifies. 

1117 """ 

1118 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds) 

1119 if isinstance(standardized, ExpandedDataCoordinate): 

1120 return standardized 

1121 elif isinstance(dataId, ExpandedDataCoordinate): 

1122 records = dict(records) if records is not None else {} 

1123 records.update(dataId.records) 

1124 else: 

1125 records = dict(records) if records is not None else {} 

1126 keys = dict(standardized) 

1127 for element in standardized.graph.primaryKeyTraversalOrder: 

1128 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1129 if record is ...: 

1130 storage = self._dimensions[element] 

1131 record = storage.fetch(keys) 

1132 records[element] = record 

1133 if record is not None: 

1134 keys.update((d, getattr(record, d.name)) for d in element.implied) 

1135 else: 

1136 if element in standardized.graph.required: 

1137 raise LookupError( 

1138 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1139 ) 

1140 records.update((d, None) for d in element.implied) 

1141 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

1142 
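# Illustrative sketch: expanding a minimal data ID into an
# `ExpandedDataCoordinate` that carries the matching dimension records. The
# dimension names and values are hypothetical.
#
#     dataId = registry.expandDataId(instrument="DummyCam", visit=42)
#     print(dataId.records)  # mapping of dimension element -> DimensionRecord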

1143 def insertDimensionData(self, element: Union[DimensionElement, str], 

1144 *data: Union[dict, DimensionRecord], 

1145 conform: bool = True): 

1146 """Insert one or more dimension records into the database. 

1147 

1148 Parameters 

1149 ---------- 

1150 element : `DimensionElement` or `str` 

1151 The `DimensionElement` or name thereof that identifies the table 

1152 records will be inserted into. 

1153 data : `dict` or `DimensionRecord` (variadic) 

1154 One or more records to insert. 

1155 conform : `bool`, optional 

1156 If `False` (`True` is default) perform no checking or conversions, 

1157 and assume that ``element`` is a `DimensionElement` instance and 

1158 ``data`` is a one or more `DimensionRecord` instances of the 

1159 appropriate subclass. 

1160 """ 

1161 if conform: 

1162 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1163 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1164 for row in data] 

1165 else: 

1166 records = data 

1167 storage = self._dimensions[element] 

1168 storage.insert(*records) 

1169 
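# Illustrative sketch: inserting dimension records as plain dictionaries, which
# ``conform=True`` converts to the appropriate `DimensionRecord` subclass. The
# element names and record fields shown are hypothetical and depend on the
# configured dimension universe.
#
#     registry.insertDimensionData("instrument", {"name": "DummyCam"})
#     registry.insertDimensionData(
#         "detector",
#         {"instrument": "DummyCam", "id": 1, "full_name": "S00"},
#         {"instrument": "DummyCam", "id": 2, "full_name": "S01"},
#     )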

1170 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]: 

1171 """Iterate over the dataset types whose names match an expression. 

1172 

1173 Parameters 

1174 ---------- 

1175 expression : `Any`, optional 

1176 An expression that fully or partially identifies the dataset types 

1177 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1178 `...` can be used to return all dataset types, and is the default. 

1179 See :ref:`daf_butler_dataset_type_expressions` for more 

1180 information. 

1181 

1182 Yields 

1183 ------ 

1184 datasetType : `DatasetType` 

1185 A `DatasetType` instance whose name matches ``expression``. 

1186 """ 

1187 yield from self._datasetStorage.fetchDatasetTypes(expression) 

1188 

1189 def queryCollections(self, expression: Any = ..., 

1190 datasetType: Optional[DatasetType] = None, 

1191 collectionType: Optional[CollectionType] = None, 

1192 flattenChains: bool = False, 

1193 includeChains: Optional[bool] = None) -> Iterator[str]: 

1194 """Iterate over the collections whose names match an expression. 

1195 

1196 Parameters 

1197 ---------- 

1198 expression : `Any`, optional 

1199 An expression that fully or partially identifies the collections 

1200 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1201 `...` can be used to return all collections, and is the default. 

1202 See :ref:`daf_butler_collection_expressions` for more 

1203 information. 

1204 datasetType : `DatasetType`, optional 

1205 If provided, only yield collections that should be searched for 

1206 this dataset type according to ``expression``. If this is 

1207 not provided, any dataset type restrictions in ``expression`` are 

1208 ignored. 

1209 collectionType : `CollectionType`, optional 

1210 If provided, only yield collections of this type. 

1211 flattenChains : `bool`, optional 

1212 If `True` (`False` is default), recursively yield the child 

1213 collections of matching `~CollectionType.CHAINED` collections. 

1214 includeChains : `bool`, optional 

1215 If `True`, yield records for matching `~CollectionType.CHAINED` 

1216 collections. Default is the opposite of ``flattenChains``: include 

1217 either CHAINED collections or their children, but not both. 

1218 

1219 Yields 

1220 ------ 

1221 collection : `str` 

1222 The name of a collection that matches ``expression``. 

1223 """ 

1224 query = CollectionQuery.fromExpression(expression) 

1225 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1226 flattenChains=flattenChains, includeChains=includeChains): 

1227 yield record.name 

1228 
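# Illustrative sketch: enumerating dataset types by regular expression and
# collections by type. The pattern is hypothetical; `...` means "everything".
#
#     import re
#     for datasetType in registry.queryDatasetTypes(re.compile(r"calexp.*")):
#         print(datasetType.name)
#     for name in registry.queryCollections(..., collectionType=CollectionType.RUN):
#         print(name)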

1229 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1230 """Return a `QueryBuilder` instance capable of constructing and 

1231 managing more complex queries than those obtainable via `Registry` 

1232 interfaces. 

1233 

1234 This is an advanced interface; downstream code should prefer 

1235 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1236 are sufficient. 

1237 

1238 Parameters 

1239 ---------- 

1240 summary : `QuerySummary` 

1241 Object describing and categorizing the full set of dimensions that 

1242 will be included in the query. 

1243 

1244 Returns 

1245 ------- 

1246 builder : `QueryBuilder` 

1247 Object that can be used to construct and perform advanced queries. 

1248 """ 

1249 return QueryBuilder(connection=self._connection, summary=summary, 

1250 dimensionStorage=self._dimensions, 

1251 datasetStorage=self._datasetStorage) 

1252 

1253 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1254 dataId: Optional[DataId] = None, 

1255 datasets: Any = None, 

1256 collections: Any = None, 

1257 where: Optional[str] = None, 

1258 expand: bool = True, 

1259 **kwds) -> Iterator[DataCoordinate]: 

1260 """Query for and iterate over data IDs matching user-provided criteria. 

1261 

1262 Parameters 

1263 ---------- 

1264 dimensions : `Dimension` or `str`, or iterable thereof 

1265 The dimensions of the data IDs to yield, as either `Dimension` 

1266 instances or `str`. Will be automatically expanded to a complete 

1267 `DimensionGraph`. 

1268 dataId : `dict` or `DataCoordinate`, optional 

1269 A data ID whose key-value pairs are used as equality constraints 

1270 in the query. 

1271 datasets : `Any`, optional 

1272 An expression that fully or partially identifies dataset types 

1273 that should constrain the yielded data IDs. For example, including 

1274 "raw" here would constrain the yielded ``instrument``, 

1275 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1276 those for which at least one "raw" dataset exists in 

1277 ``collections``. Allowed types include `DatasetType`, `str`, 

1278 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1279 expressions, `...` is not permitted - it doesn't make sense to 

1280 constrain data IDs on the existence of *all* datasets. 

1281 See :ref:`daf_butler_dataset_type_expressions` for more 

1282 information. 

1283 collections : `Any`, optional 

1284 An expression that fully or partially identifies the collections 

1285 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1286 thereof. `...` can be used to return all collections. Must be 

1287 provided if ``datasets`` is, and is ignored if it is not. See 

1288 :ref:`daf_butler_collection_expressions` for more information. 

1289 where : `str`, optional 

1290 A string expression similar to a SQL WHERE clause. May involve 

1291 any column of a dimension table or (as a shortcut for the primary 

1292 key column of a dimension table) dimension name. See 

1293 :ref:`daf_butler_dimension_expressions` for more information. 

1294 expand : `bool`, optional 

1295 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1296 minimal `DataCoordinate` base-class instances. 

1297 kwds 

1298 Additional keyword arguments are forwarded to 

1299 `DataCoordinate.standardize` when processing the ``dataId`` 

1300 argument (and may be used to provide a constraining data ID even 

1301 when the ``dataId`` argument is `None`). 

1302 

1303 Yields 

1304 ------ 

1305 dataId : `DataCoordinate` 

1306 Data IDs matching the given query parameters. Order is 

1307 unspecified. 

1308 """ 

1309 dimensions = iterable(dimensions) 

1310 standardizedDataId = self.expandDataId(dataId, **kwds) 

1311 standardizedDatasetTypes = [] 

1312 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1313 if datasets is not None: 

1314 if collections is None: 

1315 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1316 for datasetType in self._datasetStorage.fetchDatasetTypes(datasets): 

1317 requestedDimensionNames.update(datasetType.dimensions.names) 

1318 standardizedDatasetTypes.append(datasetType) 

1319 # Preprocess collections expression in case the original included 

1320 # single-pass iterators (we'll want to use it multiple times 

1321 # below). 

1322 collections = CollectionQuery.fromExpression(collections) 

1323 

1324 summary = QuerySummary( 

1325 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1326 dataId=standardizedDataId, 

1327 expression=where, 

1328 ) 

1329 builder = self.makeQueryBuilder(summary) 

1330 for datasetType in standardizedDatasetTypes: 

1331 builder.joinDataset(datasetType, collections, isResult=False) 

1332 query = builder.finish() 

1333 predicate = query.predicate() 

1334 for row in query.execute(): 

1335 if predicate(row): 

1336 result = query.extractDataId(row) 

1337 if expand: 

1338 yield self.expandDataId(result, records=standardizedDataId.records) 

1339 else: 

1340 yield result 

1341 
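# Illustrative sketch: querying data IDs constrained by the existence of a
# dataset type in some collections and by a WHERE expression. The dataset type,
# collection, and values are hypothetical, and indexing the returned data IDs
# by dimension name is assumed to be supported.
#
#     for dataId in registry.queryDimensions(
#             ["visit", "detector"],
#             datasets="raw",
#             collections=["my_run"],
#             where="instrument = 'DummyCam' AND visit > 100"):
#         print(dataId["visit"], dataId["detector"])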

1342 def queryDatasets(self, datasetType: Any, *, 

1343 collections: Any, 

1344 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1345 dataId: Optional[DataId] = None, 

1346 where: Optional[str] = None, 

1347 deduplicate: bool = False, 

1348 expand: bool = True, 

1349 **kwds) -> Iterator[DatasetRef]: 

1350 """Query for and iterate over dataset references matching user-provided 

1351 criteria. 

1352 

1353 Parameters 

1354 ---------- 

1355 datasetType 

1356 An expression that fully or partially identifies the dataset types 

1357 to be queried. Allowed types include `DatasetType`, `str`, 

1358 `re.Pattern`, and iterables thereof. The special value `...` can 

1359 be used to query all dataset types. See 

1360 :ref:`daf_butler_dataset_type_expressions` for more information. 

1361 collections 

1362 An expression that fully or partially identifies the collections 

1363 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1364 thereof. `...` can be used to return all collections. See 

1365 :ref:`daf_butler_collection_expressions` for more information. 

1366 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1367 Dimensions to include in the query (in addition to those used 

1368 to identify the queried dataset type(s)), either to constrain 

1369 the resulting datasets to those for which a matching dimension 

1370 exists, or to relate the dataset type's dimensions to dimensions 

1371 referenced by the ``dataId`` or ``where`` arguments. 

1372 dataId : `dict` or `DataCoordinate`, optional 

1373 A data ID whose key-value pairs are used as equality constraints 

1374 in the query. 

1375 where : `str`, optional 

1376 A string expression similar to a SQL WHERE clause. May involve 

1377 any column of a dimension table or (as a shortcut for the primary 

1378 key column of a dimension table) dimension name. See 

1379 :ref:`daf_butler_dimension_expressions` for more information. 

1380 deduplicate : `bool`, optional 

1381 If `True` (`False` is default), for each result data ID, only 

1382 yield one `DatasetRef` of each `DatasetType`, from the first 

1383 collection in which a dataset of that dataset type appears 

1384 (according to the order of ``collections`` passed in). If `True`, 

1385 ``collections`` must not contain regular expressions and may not 

1386 be `...`. 

1387 expand : `bool`, optional 

1388 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1389 minimal `DataCoordinate` base-class instances. 

1390 kwds 

1391 Additional keyword arguments are forwarded to 

1392 `DataCoordinate.standardize` when processing the ``dataId`` 

1393 argument (and may be used to provide a constraining data ID even 

1394 when the ``dataId`` argument is `None`). 

1395 

1396 Yields 

1397 ------ 

1398 ref : `DatasetRef` 

1399 Dataset references matching the given query criteria. These 

1400 are grouped by `DatasetType` if the query evaluates to multiple 

1401 dataset types, but order is otherwise unspecified. 

1402 

1403 Raises 

1404 ------ 

1405 TypeError 

1406 Raised when the arguments are incompatible, such as when a 

1407 collection wildcard is passed when ``deduplicate`` is `True`. 

1408 

1409 Notes 

1410 ----- 

1411 When multiple dataset types are queried in a single call, the 

1412 results of this operation are equivalent to querying for each dataset 

1413 type separately in turn, and no information about the relationships 

1414 between datasets of different types is included. In contexts where 

1415 that kind of information is important, the recommended pattern is to 

1416 use `queryDimensions` to first obtain data IDs (possibly with the 

1417 desired dataset types and collections passed as constraints to the 

1418 query), and then use multiple (generally much simpler) calls to 

1419 `queryDatasets` with the returned data IDs passed as constraints. 

1420 """ 

1421 # Standardize and expand the data ID provided as a constraint. 

1422 standardizedDataId = self.expandDataId(dataId, **kwds) 

1423 # If the datasetType passed isn't actually a DatasetType, expand it 

1424 # (it could be an expression that yields multiple DatasetTypes) and 

1425 # recurse. 

1426 if not isinstance(datasetType, DatasetType): 

1427 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType): 

1428 yield from self.queryDatasets(trueDatasetType, collections=collections, 

1429 dimensions=dimensions, dataId=standardizedDataId, 

1430 where=where, deduplicate=deduplicate, expand=expand) 

1431 return 

1432 # The full set of dimensions in the query is the combination of those 

1433 # needed for the DatasetType and those explicitly requested, if any. 

1434 requestedDimensionNames = set(datasetType.dimensions.names) 

1435 if dimensions is not None: 

1436 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1437 # Construct the summary structure needed to construct a QueryBuilder. 

1438 summary = QuerySummary( 

1439 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1440 dataId=standardizedDataId, 

1441 expression=where, 

1442 ) 

1443 builder = self.makeQueryBuilder(summary) 

1444 # Add the dataset subquery to the query, telling the QueryBuilder to 

1445 # include the rank of the selected collection in the results only if we 

1446 # need to deduplicate. Note that if any of the collections are 

1447 # actually wildcard expressions, and we've asked for deduplication, 

1448 # this will raise TypeError for us. 

1449 builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate) 

1450 query = builder.finish() 

1451 predicate = query.predicate() 

1452 if not deduplicate or len(collections) == 1: 

1453 # No need to de-duplicate across collections. 

1454 for row in query.execute(): 

1455 if predicate(row): 

1456 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1457 if expand: 

1458 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1459 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1460 else: 

1461 # For each data ID, yield only the DatasetRef with the lowest 

1462 # collection rank. 

1463 bestRefs = {} 

1464 bestRanks = {} 

1465 for row in query.execute(): 

1466 if predicate(row): 

1467 ref, rank = query.extractDatasetRef(row, datasetType) 

1468 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1469 if rank < bestRank: 

1470 bestRefs[ref.dataId] = ref 

1471 bestRanks[ref.dataId] = rank 

1472 # If caller requested expanded data IDs, we defer that until here 

1473 # so we do as little expansion as possible. 

1474 if expand: 

1475 for ref in bestRefs.values(): 

1476 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1477 yield ref.expanded(dataId) 

1478 else: 

1479 yield from bestRefs.values() 

1480 
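# Illustrative sketch: finding all datasets of one type across an ordered list
# of collections, keeping only the first match per data ID. The names and the
# WHERE expression are hypothetical.
#
#     refs = registry.queryDatasets(
#         "calexp",
#         collections=["my_rerun", "my_run"],
#         where="detector = 1",
#         deduplicate=True,
#     )
#     for ref in refs:
#         print(ref.dataId, ref.run)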

1481 dimensions: DimensionUniverse 

1482 """The universe of all dimensions known to the registry 

1483 (`DimensionUniverse`). 

1484 """ 

1485 

1486 storageClasses: StorageClassFactory 

1487 """All storage classes known to the registry (`StorageClassFactory`). 

1488 """