
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = (
    "AmbiguousDatasetError",
    "ConflictingDefinitionError",
    "ConsistentDataIds",
    "InconsistentDataIdError",
    "OrphanedRecordError",
    "Registry",
)

import contextlib
from dataclasses import dataclass
import sys
from typing import (
    Any,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Type,
    TYPE_CHECKING,
    Union,
)

import sqlalchemy

import lsst.sphgeom
from ..core import (
    Config,
    DataCoordinate,
    DataId,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionElement,
    DimensionGraph,
    DimensionRecord,
    DimensionUniverse,
    ExpandedDataCoordinate,
    FakeDatasetRef,
    StorageClassFactory,
)
from ..core import ddl
from ..core.utils import doImport, iterable, transactional
from ._config import RegistryConfig
from .queries import (
    DatasetRegistryStorage,
    QueryBuilder,
    QuerySummary,
)
from .tables import makeRegistryTableSpecs
from ._collectionType import CollectionType
from .wildcards import CollectionQuery, CollectionSearch
from .interfaces import DatabaseConflictError

if TYPE_CHECKING:
    from ..butlerConfig import ButlerConfig
    from ..core import (
        Quantum
    )
    from .interfaces import (
        CollectionManager,
        Database,
        OpaqueTableStorageManager,
        DimensionRecordStorageManager,
    )


@dataclass
class ConsistentDataIds:
    """A struct used to report relationships between data IDs by
    `Registry.relateDataIds`.

    If an instance of this class is returned (instead of `None`), the data IDs
    are "not inconsistent" - any keys they have in common have the same value,
    and any spatial or temporal relationships they have at least might involve
    an overlap. To capture this, any instance of `ConsistentDataIds` coerces
    to `True` in boolean contexts.
    """

    overlaps: bool
    """If `True`, the data IDs have at least one key in common, associated with
    the same value.

    Note that data IDs are not inconsistent even if overlaps is `False` - they
    may simply have no keys in common, which means they cannot have
    inconsistent values for any keys. They may even be equal, in the case that
    both data IDs are empty.

    This field does _not_ indicate whether a spatial or temporal overlap
    relationship exists.
    """

    contains: bool
    """If `True`, all keys in the second data ID are in the first, and are
    associated with the same values.

    This includes the case where the second data ID is empty.
    """

    within: bool
    """If `True`, all keys in the first data ID are in the second, and are
    associated with the same values.

    This includes the case where the first data ID is empty.
    """


    @property
    def equal(self) -> bool:
        """If `True`, the two data IDs are the same.

        Data IDs are equal if they have both a `contains` and a `within`
        relationship.
        """
        return self.contains and self.within

    @property
    def disjoint(self) -> bool:
        """If `True`, the two data IDs have no keys in common.

        This is simply the opposite of `overlaps`. Disjoint data IDs are by
        definition not inconsistent.
        """
        return not self.overlaps

    def __bool__(self) -> bool:
        return True


class InconsistentDataIdError(ValueError):
    """Exception raised when a data ID contains contradictory key-value pairs,
    according to dimension relationships.

    This can include the case where the data ID identifies multiple spatial
    regions or timespans that are disjoint.
    """


class AmbiguousDatasetError(Exception):
    """Exception raised when a `DatasetRef` has no ID and a `Registry`
    operation requires one.
    """


class ConflictingDefinitionError(Exception):
    """Exception raised when trying to insert a database record when a
    conflicting record already exists.
    """


class OrphanedRecordError(Exception):
    """Exception raised when trying to remove or modify a database record
    that is still being used in some other table.
    """


def _checkAndGetId(ref: DatasetRef) -> int:
    """Return the ID of the given `DatasetRef`, or raise if it is `None`.

    This trivial function exists to allow operations that would otherwise be
    natural list comprehensions to check that the ID is not `None` as well.

    Parameters
    ----------
    ref : `DatasetRef`
        Dataset reference.

    Returns
    -------
    id : `int`
        ``ref.id``

    Raises
    ------
    AmbiguousDatasetError
        Raised if ``ref.id`` is `None`.
    """
    if ref.id is None:
        raise AmbiguousDatasetError("Dataset ID must not be `None`.")
    return ref.id


class Registry:
    """Registry interface.

    Parameters
    ----------
    config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
        Registry configuration
    """

    defaultConfigFile = None
    """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or
    absolute path. Can be None if no defaults specified.
    """

    @classmethod
    def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False,
                   butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry:
        """Create `Registry` subclass instance from `config`.

        Uses ``registry.cls`` from `config` to determine which subclass to
        instantiate.

        Parameters
        ----------
        config : `ButlerConfig`, `RegistryConfig`, `Config` or `str`
            Registry configuration
        create : `bool`, optional
            Assume empty Registry and create a new one.
        butlerRoot : `str`, optional
            Path to the repository root this `Registry` will manage.
        writeable : `bool`, optional
            If `True` (default) create a read-write connection to the database.

        Returns
        -------
        registry : `Registry` (subclass)
            A new `Registry` subclass instance.
        """
        if not isinstance(config, RegistryConfig):
            if isinstance(config, str) or isinstance(config, Config):
                config = RegistryConfig(config)
            else:
                raise ValueError("Incompatible Registry configuration: {}".format(config))
        config.replaceRoot(butlerRoot)
        DatabaseClass = config.getDatabaseClass()
        database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0),
                                         namespace=config.get("namespace"), writeable=writeable)
        universe = DimensionUniverse(config)
        opaque = doImport(config["managers", "opaque"])
        dimensions = doImport(config["managers", "dimensions"])
        collections = doImport(config["managers", "collections"])
        return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections,
                   create=create)

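    # A minimal usage sketch for the factory above (not executed; the config
    # path "registry.yaml" and the flag values are illustrative assumptions):
    #
    #     config = RegistryConfig("registry.yaml")
    #     registry = Registry.fromConfig(config, create=True, writeable=True)
    #     assert registry.isWriteable()
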

    def __init__(self, database: Database, universe: DimensionUniverse, *,
                 opaque: Type[OpaqueTableStorageManager],
                 dimensions: Type[DimensionRecordStorageManager],
                 collections: Type[CollectionManager],
                 create: bool = False):
        self._db = database
        self.storageClasses = StorageClassFactory()
        with self._db.declareStaticTables(create=create) as context:
            self._dimensions = dimensions.initialize(self._db, context, universe=universe)
            self._collections = collections.initialize(self._db, context)
            self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, self._collections))
            self._opaque = opaque.initialize(self._db, context)
        self._collections.refresh()
        # TODO: we shouldn't be grabbing the private connection from the
        # Database instance like this, but it's a reasonable way to proceed
        # while we transition to using the Database API more.
        self._connection = self._db._connection
        self._datasetStorage = DatasetRegistryStorage(connection=self._connection,
                                                      universe=self.dimensions,
                                                      tables=self._tables._asdict(),
                                                      collections=self._collections)
        self._datasetTypes = {}

    def __str__(self) -> str:
        return str(self._db)

    def __repr__(self) -> str:
        return f"Registry({self._db!r}, {self.dimensions!r})"

    def isWriteable(self) -> bool:
        """Return `True` if this registry allows write operations, and `False`
        otherwise.
        """
        return self._db.isWriteable()

    @property
    def dimensions(self) -> DimensionUniverse:
        """All dimensions recognized by this `Registry` (`DimensionUniverse`).
        """
        return self._dimensions.universe

    @contextlib.contextmanager
    def transaction(self):
        """Return a context manager that represents a transaction.
        """
        # TODO make savepoint=False the default.
        try:
            with self._db.transaction():
                yield
        except BaseException:
            # TODO: this clears the caches sometimes when we wouldn't actually
            # need to. Can we avoid that?
            self._dimensions.clearCaches()
            self._datasetTypes.clear()
            raise

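    # Sketch of typical use of the context manager above (``registry`` and
    # ``refs`` are hypothetical objects and the dimension values are
    # abbreviated; nothing here is executed):
    #
    #     with registry.transaction():
    #         registry.insertDimensionData("instrument", {"name": "DummyCam"})
    #         registry.associate("my_tag", refs)
    #     # Both operations commit together; an exception rolls them back and
    #     # clears the in-memory caches, as handled in the except clause above.
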

    def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec):
        """Add an opaque (to the `Registry`) table for use by a `Datastore` or
        other data repository client.

        Opaque table records can be added via `insertOpaqueData`, retrieved via
        `fetchOpaqueData`, and removed via `deleteOpaqueData`.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. This may differ from the
            actual name used in the database by a prefix and/or suffix.
        spec : `ddl.TableSpec`
            Specification for the table to be added.
        """
        self._opaque.register(tableName, spec)

    @transactional
    def insertOpaqueData(self, tableName: str, *data: dict):
        """Insert records into an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        data
            Each additional positional argument is a dictionary that represents
            a single row to be added.
        """
        self._opaque[tableName].insert(*data)

    def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]:
        """Retrieve records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the returned rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.

        Yields
        ------
        row : `dict`
            A dictionary representing a single result row.
        """
        yield from self._opaque[tableName].fetch(**where)

    @transactional
    def deleteOpaqueData(self, tableName: str, **where: Any):
        """Remove records from an opaque table.

        Parameters
        ----------
        tableName : `str`
            Logical name of the opaque table. Must match the name used in a
            previous call to `registerOpaqueTable`.
        where
            Additional keyword arguments are interpreted as equality
            constraints that restrict the deleted rows (combined with AND);
            keyword arguments are column names and values are the values they
            must have.
        """
        self._opaque[tableName].delete(**where)

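    # Sketch of the opaque-table round trip provided by the three methods
    # above. The table name, column specification, and values are invented
    # for illustration, and the exact ``ddl`` field arguments are assumptions:
    #
    #     spec = ddl.TableSpec(fields=[
    #         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
    #         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
    #     ])
    #     registry.registerOpaqueTable("datastore_records", spec)
    #     registry.insertOpaqueData("datastore_records", {"dataset_id": 1, "path": "a.fits"})
    #     rows = list(registry.fetchOpaqueData("datastore_records", dataset_id=1))
    #     registry.deleteOpaqueData("datastore_records", dataset_id=1)
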

    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED):
        """Add a new collection if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the collection to create.
        type : `CollectionType`
            Enum value indicating the type of collection to create.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        self._collections.register(name, type)

    def getCollectionType(self, name: str) -> CollectionType:
        """Return an enumeration value indicating the type of the given
        collection.

        Parameters
        ----------
        name : `str`
            The name of the collection.

        Returns
        -------
        type : `CollectionType`
            Enum value indicating the type of this collection.

        Raises
        ------
        MissingCollectionError
            Raised if no collection with the given name exists.
        """
        return self._collections.find(name).type

    def registerRun(self, name: str):
        """Add a new run if one with the given name does not exist.

        Parameters
        ----------
        name : `str`
            The name of the run to create.

        Notes
        -----
        This method cannot be called within transactions, as it needs to be
        able to perform its own transaction to be concurrent.
        """
        self._collections.register(name, CollectionType.RUN)

    def getCollectionChain(self, parent: str) -> CollectionSearch:
        """Return the child collections in a `~CollectionType.CHAINED`
        collection.

        Parameters
        ----------
        parent : `str`
            Name of the chained collection. Must have already been added via
            a call to `Registry.registerCollection`.

        Returns
        -------
        children : `CollectionSearch`
            An object that defines the search path of the collection.
            See :ref:`daf_butler_collection_expressions` for more information.

        Raises
        ------
        MissingCollectionError
            Raised if ``parent`` does not exist in the `Registry`.
        TypeError
            Raised if ``parent`` does not correspond to a
            `~CollectionType.CHAINED` collection.
        """
        record = self._collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        return record.children

    def setCollectionChain(self, parent: str, children: Any):
        """Define or redefine a `~CollectionType.CHAINED` collection.

        Parameters
        ----------
        parent : `str`
            Name of the chained collection. Must have already been added via
            a call to `Registry.registerCollection`.
        children : `Any`
            An expression defining an ordered search of child collections,
            generally an iterable of `str`. Restrictions on the dataset types
            to be searched can also be included, by passing a mapping or an
            iterable containing tuples; see
            :ref:`daf_butler_collection_expressions` for more information.

        Raises
        ------
        MissingCollectionError
            Raised when any of the given collections do not exist in the
            `Registry`.
        TypeError
            Raised if ``parent`` does not correspond to a
            `~CollectionType.CHAINED` collection.
        ValueError
            Raised if the given collections contain a cycle.
        """
        record = self._collections.find(parent)
        if record.type is not CollectionType.CHAINED:
            raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.")
        children = CollectionSearch.fromExpression(children)
        if children != record.children:
            record.update(self._collections, children)

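    # Sketch of chaining collections with the two methods above (collection
    # names are hypothetical and the child collections must already exist):
    #
    #     registry.registerCollection("calibs", type=CollectionType.CHAINED)
    #     registry.setCollectionChain("calibs", ["calibs/v2", "calibs/v1"])
    #     children = registry.getCollectionChain("calibs")  # a CollectionSearch
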

    @transactional
    def registerDatasetType(self, datasetType: DatasetType) -> bool:
        """
        Add a new `DatasetType` to the Registry.

        It is not an error to register the same `DatasetType` twice.

        Parameters
        ----------
        datasetType : `DatasetType`
            The `DatasetType` to be added.

        Returns
        -------
        inserted : `bool`
            `True` if ``datasetType`` was inserted, `False` if an identical
            existing `DatasetType` was found. Note that in either case the
            DatasetType is guaranteed to be defined in the Registry
            consistently with the given definition.

        Raises
        ------
        ValueError
            Raised if the dimensions or storage class are invalid.
        ConflictingDefinitionError
            Raised if this DatasetType is already registered with a different
            definition.
        """
        # TODO: this implementation isn't concurrent, except *maybe* in SQLite
        # with aggressive locking (where starting a transaction is essentially
        # the same as grabbing a full-database lock). Should be reimplemented
        # with Database.sync to fix this, but that may require schema changes
        # as well so we only have to synchronize one row to know if we have
        # inconsistent definitions.

        # If the DatasetType is already in the cache, we assume it's already in
        # the DB (note that we don't actually provide a way to remove them from
        # the DB).
        existingDatasetType = self._datasetTypes.get(datasetType.name)
        # If it's not in the cache, try to insert it.
        if existingDatasetType is None:
            try:
                with self._db.transaction():
                    self._db.insert(
                        self._tables.dataset_type,
                        {
                            "dataset_type_name": datasetType.name,
                            "storage_class": datasetType.storageClass.name,
                        }
                    )
            except sqlalchemy.exc.IntegrityError:
                # Insert failed on the only unique constraint on this table:
                # dataset_type_name. So now the question is whether the one in
                # there is the same as the one we tried to insert.
                existingDatasetType = self.getDatasetType(datasetType.name)
            else:
                # If adding the DatasetType record itself succeeded, add its
                # dimensions (if any). We don't guard this in a try block
                # because a problem with this insert means the database
                # content must be corrupted.
                if datasetType.dimensions:
                    self._db.insert(
                        self._tables.dataset_type_dimensions,
                        *[{"dataset_type_name": datasetType.name,
                           "dimension_name": dimensionName}
                          for dimensionName in datasetType.dimensions.names]
                    )
                # Update the cache.
                self._datasetTypes[datasetType.name] = datasetType
                # Also register component DatasetTypes (if any).
                for compName, compStorageClass in datasetType.storageClass.components.items():
                    compType = DatasetType(datasetType.componentTypeName(compName),
                                           dimensions=datasetType.dimensions,
                                           storageClass=compStorageClass)
                    self.registerDatasetType(compType)
                # Inserts succeeded, nothing left to do here.
                return True
        # A DatasetType with this name exists, check if it is equal
        if datasetType == existingDatasetType:
            return False
        else:
            raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}")

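    # Sketch of registering a dataset type with the method above. The dataset
    # type name, dimension names, and storage class are illustrative, and
    # ``DimensionUniverse.extract`` is assumed as the way to build the graph:
    #
    #     datasetType = DatasetType("calexp",
    #                               dimensions=registry.dimensions.extract(["instrument", "visit"]),
    #                               storageClass="ExposureF")
    #     inserted = registry.registerDatasetType(datasetType)  # False if identical type exists
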

    def getDatasetType(self, name: str) -> DatasetType:
        """Get the `DatasetType`.

        Parameters
        ----------
        name : `str`
            Name of the type.

        Returns
        -------
        type : `DatasetType`
            The `DatasetType` associated with the given name.

        Raises
        ------
        KeyError
            Requested named DatasetType could not be found in registry.
        """
        datasetType = self._datasetTypes.get(name)
        if datasetType is None:
            # Get StorageClass from DatasetType table
            result = self._db.query(
                sqlalchemy.sql.select(
                    [self._tables.dataset_type.c.storage_class]
                ).where(
                    self._tables.dataset_type.columns.dataset_type_name == name
                )
            ).fetchone()

            if result is None:
                raise KeyError("Could not find entry for datasetType {}".format(name))

            storageClass = self.storageClasses.getStorageClass(result["storage_class"])
            # Get Dimensions (if any) from DatasetTypeDimensions table
            result = self._db.query(
                sqlalchemy.sql.select(
                    [self._tables.dataset_type_dimensions.columns.dimension_name]
                ).where(
                    self._tables.dataset_type_dimensions.columns.dataset_type_name == name
                )
            ).fetchall()
            dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ())
            datasetType = DatasetType(name=name,
                                      storageClass=storageClass,
                                      dimensions=dimensions)
            self._datasetTypes[name] = datasetType
        return datasetType

    def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy,
                               datasetType: Optional[DatasetType] = None,
                               dataId: Optional[DataCoordinate] = None):
        """Construct a DatasetRef from the result of a query on the Dataset
        table.

        Parameters
        ----------
        row : `sqlalchemy.engine.RowProxy`
            Row of a query that contains all columns from the `Dataset` table.
            May include additional fields (which will be ignored).
        datasetType : `DatasetType`, optional
            `DatasetType` associated with this dataset. Will be retrieved
            if not provided. If provided, the caller guarantees that it is
            already consistent with what would have been retrieved from the
            database.
        dataId : `DataCoordinate`, optional
            Dimensions associated with this dataset. Will be retrieved if not
            provided. If provided, the caller guarantees that it is already
            consistent with what would have been retrieved from the database.

        Returns
        -------
        ref : `DatasetRef`
            A new `DatasetRef` instance.
        """
        if datasetType is None:
            datasetType = self.getDatasetType(row["dataset_type_name"])
        runRecord = self._collections[row[self._collections.getRunForeignKeyName()]]
        assert runRecord is not None, "Should be guaranteed by foreign key constraints."
        run = runRecord.name
        datasetRefHash = row["dataset_ref_hash"]
        if dataId is None:
            # TODO: should we expand here?
            dataId = DataCoordinate.standardize(
                row,
                graph=datasetType.dimensions,
                universe=self.dimensions
            )
        # Get components (if present)
        components = {}
        if datasetType.storageClass.isComposite():
            t = self._tables
            columns = list(t.dataset.columns)
            columns.append(t.dataset_composition.columns.component_name)
            results = self._db.query(
                sqlalchemy.sql.select(
                    columns
                ).select_from(
                    t.dataset.join(
                        t.dataset_composition,
                        (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id)
                    )
                ).where(
                    t.dataset_composition.columns.parent_dataset_id == row["dataset_id"]
                )
            ).fetchall()
            for result in results:
                componentName = result["component_name"]
                componentDatasetType = DatasetType(
                    DatasetType.nameWithComponent(datasetType.name, componentName),
                    dimensions=datasetType.dimensions,
                    storageClass=datasetType.storageClass.components[componentName]
                )
                components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId,
                                                                        datasetType=componentDatasetType)
        if not components.keys() <= datasetType.storageClass.components.keys():
            raise RuntimeError(
                f"Inconsistency detected between dataset and storage class definitions: "
                f"{datasetType.storageClass.name} has components "
                f"{set(datasetType.storageClass.components.keys())}, "
                f"but dataset has components {set(components.keys())}"
            )
        return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run,
                          hash=datasetRefHash, components=components)

    def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *,
                    collections: Any, **kwds: Any) -> Optional[DatasetRef]:
        """Find a dataset given its `DatasetType` and data ID.

        This can be used to obtain a `DatasetRef` that permits the dataset to
        be read from a `Datastore`.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataId : `dict` or `DataCoordinate`, optional
            A `dict`-like object containing the `Dimension` links that identify
            the dataset within a collection.
        collections
            An expression that fully or partially identifies the collections
            to search for the dataset, such as a `str`, `re.Pattern`, or
            iterable thereof. `...` can be used to return all collections.
            See :ref:`daf_butler_collection_expressions` for more information.
        **kwds
            Additional keyword arguments passed to
            `DataCoordinate.standardize` to convert ``dataId`` to a true
            `DataCoordinate` or augment an existing one.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the dataset, or `None` if no matching Dataset
            was found.

        Raises
        ------
        LookupError
            Raised if one or more data ID keys are missing.
        MissingCollectionError
            Raised if any of ``collections`` does not exist in the registry.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self.getDatasetType(datasetType)
        dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions,
                                            universe=self.dimensions, **kwds)
        collections = CollectionSearch.fromExpression(collections)
        for collectionRecord in collections.iter(self._collections, datasetType=datasetType):
            if collectionRecord.type is CollectionType.TAGGED:
                collectionColumn = \
                    self._tables.dataset_collection.columns[self._collections.getCollectionForeignKeyName()]
                fromClause = self._tables.dataset.join(self._tables.dataset_collection)
            elif collectionRecord.type is CollectionType.RUN:
                collectionColumn = self._tables.dataset.columns[self._collections.getRunForeignKeyName()]
                fromClause = self._tables.dataset
            else:
                raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.")
            whereTerms = [
                self._tables.dataset.columns.dataset_type_name == datasetType.name,
                collectionColumn == collectionRecord.key,
            ]
            whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys())
            query = self._tables.dataset.select().select_from(
                fromClause
            ).where(
                sqlalchemy.sql.and_(*whereTerms)
            )
            result = self._db.query(query).fetchone()
            if result is not None:
                return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)
        return None

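    # Sketch of a lookup with the method above (dataset type name, data ID
    # values, and collection names are illustrative only):
    #
    #     ref = registry.findDataset("calexp", instrument="DummyCam", visit=42,
    #                                collections=["my_run", "my_tag"])
    #     if ref is not None:
    #         print(ref.id, ref.run)
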

    @transactional
    def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId],
                       run: str, *, producer: Optional[Quantum] = None, recursive: bool = False
                       ) -> List[DatasetRef]:
        """Insert one or more datasets into the `Registry`

        This always adds new datasets; to associate existing datasets with
        a new collection, use ``associate``.

        Parameters
        ----------
        datasetType : `DatasetType` or `str`
            A `DatasetType` or the name of one.
        dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate`
            Dimension-based identifiers for the new datasets.
        run : `str`
            The name of the run that produced the datasets.
        producer : `Quantum`
            Unit of work that produced the datasets. May be `None` to store
            no provenance information, but if present the `Quantum` must
            already have been added to the Registry.
        recursive : `bool`
            If True, recursively add datasets and attach entries for component
            datasets as well.

        Returns
        -------
        refs : `list` of `DatasetRef`
            Resolved `DatasetRef` instances for all given data IDs (in the same
            order).

        Raises
        ------
        ConflictingDefinitionError
            If a dataset with the same dataset type and data ID as one of those
            given already exists in the given collection.
        MissingCollectionError
            Raised if ``run`` does not exist in the registry.
        """
        if not isinstance(datasetType, DatasetType):
            datasetType = self.getDatasetType(datasetType)
        rows = []
        refs = []
        runRecord = self._collections.find(run)
        base = {
            "dataset_type_name": datasetType.name,
            self._collections.getRunForeignKeyName(): runRecord.key,
            "quantum_id": producer.id if producer is not None else None,
        }
        # Expand data IDs and build both a list of unresolved DatasetRefs
        # and a list of dictionary rows for the dataset table.
        for dataId in dataIds:
            ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions))
            refs.append(ref)
            row = dict(base, dataset_ref_hash=ref.hash)
            for dimension, value in ref.dataId.full.items():
                row[dimension.name] = value
            rows.append(row)
        # Actually insert into the dataset table.
        try:
            datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True)
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"Constraint violation while inserting datasets into run {run}. "
                f"This usually means that one or more datasets with the same dataset type and data ID "
                f"already exist in the collection, but it may be a foreign key violation."
            ) from err
        # Resolve the DatasetRefs with the autoincrement IDs we generated.
        refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)]
        if recursive and datasetType.isComposite():
            # Insert component rows by recursing, and gather a single big list
            # of rows to insert into the dataset_composition table.
            compositionRows = []
            for componentName in datasetType.storageClass.components:
                componentDatasetType = datasetType.makeComponentDatasetType(componentName)
                componentRefs = self.insertDatasets(componentDatasetType,
                                                    dataIds=(ref.dataId for ref in refs),
                                                    run=run,
                                                    producer=producer,
                                                    recursive=True)
                for parentRef, componentRef in zip(refs, componentRefs):
                    parentRef._components[componentName] = componentRef
                    compositionRows.append({
                        "parent_dataset_id": parentRef.id,
                        "component_dataset_id": componentRef.id,
                        "component_name": componentName,
                    })
            if compositionRows:
                self._db.insert(self._tables.dataset_composition, *compositionRows)
        return refs

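    # Sketch of inserting new datasets with the method above. The run must
    # already exist (see `registerRun`); names and data ID values are made up:
    #
    #     refs = registry.insertDatasets("calexp",
    #                                    dataIds=[{"instrument": "DummyCam", "visit": 42}],
    #                                    run="my_run", recursive=True)
    #     # refs[0].id now holds the autoincrement ID assigned by the database.
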

    def getDataset(self, id: int, datasetType: Optional[DatasetType] = None,
                   dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]:
        """Retrieve a Dataset entry.

        Parameters
        ----------
        id : `int`
            The unique identifier for the Dataset.
        datasetType : `DatasetType`, optional
            The `DatasetType` of the dataset to retrieve. This is used to
            short-circuit retrieving the `DatasetType`, so if provided, the
            caller is guaranteeing that it is what would have been retrieved.
        dataId : `DataCoordinate`, optional
            A `Dimension`-based identifier for the dataset within a
            collection, possibly containing additional metadata. This is used
            to short-circuit retrieving the dataId, so if provided, the
            caller is guaranteeing that it is what would have been retrieved.

        Returns
        -------
        ref : `DatasetRef`
            A ref to the Dataset, or `None` if no matching Dataset
            was found.
        """
        result = self._db.query(
            self._tables.dataset.select().where(
                self._tables.dataset.columns.dataset_id == id
            )
        ).fetchone()
        if result is None:
            return None
        return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId)


    @transactional
    def removeDataset(self, ref: DatasetRef):
        """Remove a dataset from the Registry.

        The dataset and all components will be removed unconditionally from
        all collections, and any associated `Quantum` records will also be
        removed. `Datastore` records will *not* be deleted; the caller is
        responsible for ensuring that the dataset has already been removed
        from all Datastores.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the dataset to be removed. Must include a valid
            ``id`` attribute, and should be considered invalidated upon return.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        OrphanedRecordError
            Raised if the dataset is still present in any `Datastore`.
        """
        if not ref.id:
            raise AmbiguousDatasetError(f"Cannot remove dataset {ref} without ID.")
        # Remove component datasets. We assume ``ref.components`` is already
        # correctly populated, and rely on ON DELETE CASCADE to remove entries
        # from DatasetComposition.
        for componentRef in ref.components.values():
            self.removeDataset(componentRef)

        # Remove related quanta. We rely on ON DELETE CASCADE to remove any
        # related records in dataset_consumers. Note that we permit a Quantum
        # to be deleted without removing the datasets it refers to, but do not
        # allow a dataset to be deleted without removing the Quanta that refer
        # to it. A dataset is still quite usable without provenance, but
        # provenance is worthless if it's inaccurate.
        t = self._tables
        selectProducer = sqlalchemy.sql.select(
            [t.dataset.columns.quantum_id]
        ).where(
            t.dataset.columns.dataset_id == ref.id
        )
        selectConsumers = sqlalchemy.sql.select(
            [t.dataset_consumers.columns.quantum_id]
        ).where(
            t.dataset_consumers.columns.dataset_id == ref.id
        )
        # TODO: we'd like to use Database.delete here, but it doesn't support
        # general queries yet.
        self._connection.execute(
            t.quantum.delete().where(
                t.quantum.columns.id.in_(sqlalchemy.sql.union(selectProducer, selectConsumers))
            )
        )
        # Remove the Dataset record itself. We rely on ON DELETE CASCADE to
        # remove from DatasetCollection, and assume foreign key violations
        # come from DatasetLocation (everything else should have an ON DELETE).
        try:
            self._connection.execute(
                t.dataset.delete().where(t.dataset.c.dataset_id == ref.id)
            )
        except sqlalchemy.exc.IntegrityError as err:
            raise OrphanedRecordError(f"Dataset {ref} is still present in one or more Datastores.") from err


    @transactional
    def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef):
        """Attach a component to a dataset.

        Parameters
        ----------
        name : `str`
            Name of the component.
        parent : `DatasetRef`
            A reference to the parent dataset. Will be updated to reference
            the component.
        component : `DatasetRef`
            A reference to the component dataset.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``parent.id`` or ``component.id`` is `None`.
        """
        # TODO Insert check for component name and type against
        # parent.storageClass specified components
        if parent.id is None:
            raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.")
        if component.id is None:
            raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.")
        values = dict(component_name=name,
                      parent_dataset_id=parent.id,
                      component_dataset_id=component.id)
        self._db.insert(self._tables.dataset_composition, values)
        parent._components[name] = component


    @transactional
    def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
        """Add existing Datasets to a collection, implicitly creating the
        collection if it does not already exist.

        If a DatasetRef with the same exact ``dataset_id`` is already in a
        collection, nothing is changed. If a `DatasetRef` with the same
        `DatasetType` and dimension values but with different ``dataset_id``
        exists in the collection, `ConflictingDefinitionError` is raised.

        Parameters
        ----------
        collection : `str`
            Indicates the collection the Datasets should be associated with.
        refs : iterable of `DatasetRef`
            An iterable of resolved `DatasetRef` instances that already exist
            in this `Registry`.
        recursive : `bool`, optional
            If `True`, associate all component datasets as well. Note that
            this only associates components that are actually included in the
            given `DatasetRef` instances, which may not be the same as those in
            the database (especially if they were obtained from
            `queryDatasets`, which does not populate `DatasetRef.components`).

        Raises
        ------
        ConflictingDefinitionError
            If a Dataset with the given `DatasetRef` already exists in the
            given collection.
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        MissingCollectionError
            Raised if ``collection`` does not exist in the registry.
        TypeError
            Raised if adding new datasets to the given ``collection`` is not
            allowed.
        """
        collectionRecord = self._collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.")
        if recursive:
            refs = DatasetRef.flatten(refs)
        rows = [{"dataset_id": _checkAndGetId(ref),
                 "dataset_ref_hash": ref.hash,
                 self._collections.getCollectionForeignKeyName(): collectionRecord.key}
                for ref in refs]
        try:
            self._db.replace(self._tables.dataset_collection, *rows)
        except sqlalchemy.exc.IntegrityError as err:
            raise ConflictingDefinitionError(
                f"Constraint violation while associating datasets with collection {collection}. "
                f"This probably means that one or more datasets with the same dataset type and data ID "
                f"already exist in the collection, but it may also indicate that the datasets do not exist."
            ) from err

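    # Sketch of tagging existing datasets with the method above (the TAGGED
    # collection name and the resolved ``refs`` are hypothetical):
    #
    #     registry.registerCollection("good_seeing", type=CollectionType.TAGGED)
    #     registry.associate("good_seeing", refs)
    #     registry.disassociate("good_seeing", refs[:1])   # see the next method
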

    @transactional
    def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True):
        """Remove existing Datasets from a collection.

        ``collection`` and ``ref`` combinations that are not currently
        associated are silently ignored.

        Parameters
        ----------
        collection : `str`
            The collection the Datasets should no longer be associated with.
        refs : iterable of `DatasetRef`
            An iterable of resolved `DatasetRef` instances that already exist
            in this `Registry`.
        recursive : `bool`, optional
            If `True`, disassociate all component datasets as well. Note that
            this only disassociates components that are actually included in
            the given `DatasetRef` instances, which may not be the same as
            those in the database (especially if they were obtained from
            `queryDatasets`, which does not populate `DatasetRef.components`).

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        MissingCollectionError
            Raised if ``collection`` does not exist in the registry.
        TypeError
            Raised if removing datasets from the given ``collection`` is not
            allowed.
        """
        collectionFieldName = self._collections.getCollectionForeignKeyName()
        collectionRecord = self._collections.find(collection)
        if collectionRecord.type is not CollectionType.TAGGED:
            raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; "
                            "expected TAGGED.")
        if recursive:
            refs = DatasetRef.flatten(refs)
        rows = [{"dataset_id": _checkAndGetId(ref), collectionFieldName: collectionRecord.key}
                for ref in refs]
        self._db.delete(self._tables.dataset_collection, ["dataset_id", collectionFieldName], *rows)

    @transactional
    def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
        """Record that a datastore holds the given datasets.

        Typically used by `Datastore`.

        Parameters
        ----------
        datastoreName : `str`
            Name of the datastore holding these datasets.
        refs : `~collections.abc.Iterable` of `DatasetRef`
            References to the datasets.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        """
        self._db.insert(
            self._tables.dataset_location,
            *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
        )

    @transactional
    def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]):
        """Move the dataset location information to trash.

        Parameters
        ----------
        datastoreName : `str`
            Name of the datastore holding these datasets.
        refs : `~collections.abc.Iterable` of `DatasetRef`
            References to the datasets.
        """
        # We only want to move rows that already exist in the main table
        filtered = self.checkDatasetLocations(datastoreName, refs)
        self.canDeleteDatasetLocations(datastoreName, filtered)
        self.removeDatasetLocation(datastoreName, filtered)

    @transactional
    def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]):
        """Record that a datastore can delete these datasets.

        Parameters
        ----------
        datastoreName : `str`
            Name of the datastore holding these datasets.
        refs : `~collections.abc.Iterable` of `DatasetRef`
            References to the datasets.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``any(ref.id is None for ref in refs)``.
        """
        self._db.insert(
            self._tables.dataset_location_trash,
            *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs]
        )

    def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]:
        """Check which refs are listed for this datastore.

        Parameters
        ----------
        datastoreName : `str`
            Name of the datastore holding these datasets.
        refs : `~collections.abc.Iterable` of `DatasetRef`
            References to the datasets.

        Returns
        -------
        present : `list` of `DatasetRef`
            All the `DatasetRef` that are listed.
        """

        table = self._tables.dataset_location
        result = self._db.query(
            sqlalchemy.sql.select(
                [table.columns.datastore_name, table.columns.dataset_id]
            ).where(
                sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]),
                                    table.columns.datastore_name == datastoreName)
            )
        ).fetchall()

        matched_ids = {r["dataset_id"] for r in result}
        return [ref for ref in refs if ref.id in matched_ids]

    def getDatasetLocations(self, ref: DatasetRef) -> Set[str]:
        """Retrieve datastore locations for a given dataset.

        Typically used by `Datastore`.

        Parameters
        ----------
        ref : `DatasetRef`
            A reference to the dataset for which to retrieve storage
            information.

        Returns
        -------
        datastores : `set` of `str`
            All the matching datastores holding this dataset. Empty set
            if the dataset does not exist anywhere.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        table = self._tables.dataset_location
        result = self._db.query(
            sqlalchemy.sql.select(
                [table.columns.datastore_name]
            ).where(
                table.columns.dataset_id == ref.id
            )
        ).fetchall()
        return {r["datastore_name"] for r in result}

    @transactional
    def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]:
        """Retrieve all the dataset ref IDs that are in the trash
        associated with the specified datastore.

        Parameters
        ----------
        datastoreName : `str`
            The relevant datastore name to use.

        Returns
        -------
        ids : `set` of `FakeDatasetRef`
            The IDs of datasets that can be safely removed from this datastore.
            Can be empty.
        """
        table = self._tables.dataset_location_trash
        result = self._db.query(
            sqlalchemy.sql.select(
                [table.columns.dataset_id]
            ).where(
                table.columns.datastore_name == datastoreName
            )
        ).fetchall()
        return {FakeDatasetRef(r["dataset_id"]) for r in result}

    @transactional
    def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None:
        """Remove datastore location associated with these datasets from trash.

        Typically used by `Datastore` when a dataset is removed.

        Parameters
        ----------
        datastoreName : `str`
            Name of this `Datastore`.
        refs : iterable of `FakeDatasetRef`
            The dataset IDs to be removed.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if not refs:
            return
        self._db.delete(
            self._tables.dataset_location_trash,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs]
        )

    @transactional
    def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None:
        """Remove datastore location associated with these datasets.

        Typically used by `Datastore` when a dataset is removed.

        Parameters
        ----------
        datastoreName : `str`
            Name of this `Datastore`.
        refs : iterable of `DatasetRef`
            References to the datasets for which information is to be removed.

        Raises
        ------
        AmbiguousDatasetError
            Raised if ``ref.id`` is `None`.
        """
        if not refs:
            return
        self._db.delete(
            self._tables.dataset_location,
            ["dataset_id", "datastore_name"],
            *[{"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName} for ref in refs]
        )

    def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                     records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds):
        """Expand a dimension-based data ID to include additional information.

        Parameters
        ----------
        dataId : `DataCoordinate` or `dict`, optional
            Data ID to be expanded; augmented and overridden by ``kwds``.
        graph : `DimensionGraph`, optional
            Set of dimensions for the expanded ID. If `None`, the dimensions
            will be inferred from the keys of ``dataId`` and ``kwds``.
            Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph``
            are silently ignored, providing a way to extract and expand a
            subset of a data ID.
        records : mapping [`DimensionElement`, `DimensionRecord`], optional
            Dimension record data to use before querying the database for that
            data.
        **kwds
            Additional keywords are treated like additional key-value pairs for
            ``dataId``, extending and overriding it.

        Returns
        -------
        expanded : `ExpandedDataCoordinate`
            A data ID that includes full metadata for all of the dimensions it
            identifies.
        """
        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds)
        if isinstance(standardized, ExpandedDataCoordinate):
            return standardized
        elif isinstance(dataId, ExpandedDataCoordinate):
            records = dict(records) if records is not None else {}
            records.update(dataId.records)
        else:
            records = dict(records) if records is not None else {}
        keys = dict(standardized)
        regions = []
        timespans = []
        for element in standardized.graph.primaryKeyTraversalOrder:
            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
            if record is ...:
                storage = self._dimensions[element]
                record = storage.fetch(keys)
                records[element] = record
            if record is not None:
                for d in element.implied:
                    value = getattr(record, d.name)
                    if keys.setdefault(d, value) != value:
                        raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, "
                                                      f"but {element.name} implies {d.name}={value!r}.")
                if element in standardized.graph.spatial and record.region is not None:
                    if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions):
                        raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} "
                                                      f"is disjoint with those for other elements.")
                    regions.append(record.region)
                if element in standardized.graph.temporal:
                    if any(not record.timespan.overlaps(t) for t in timespans):
                        raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}"
                                                      f" is disjoint with those for other elements.")
                    timespans.append(record.timespan)
            else:
                if element in standardized.graph.required:
                    raise LookupError(
                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
                    )
                if element.alwaysJoin:
                    raise InconsistentDataIdError(
                        f"Could not fetch record for element {element.name} via keys {keys}, "
                        f"but it is marked alwaysJoin=True; this means one or more dimensions are not "
                        f"related."
                    )
                records.update((d, None) for d in element.implied)
        return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records)

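    # Sketch of expanding a data ID with the method above (dimension names and
    # values are illustrative; matching dimension records must already exist):
    #
    #     expanded = registry.expandDataId({"instrument": "DummyCam"}, detector=1)
    #     # ``expanded`` is an ExpandedDataCoordinate whose ``records`` mapping
    #     # carries the dimension records fetched (or supplied) above.
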

    def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]:
        """Compare the keys and values of a pair of data IDs for consistency.

        See `ConsistentDataIds` for more information.

        Parameters
        ----------
        a : `dict` or `DataCoordinate`
            First data ID to be compared.
        b : `dict` or `DataCoordinate`
            Second data ID to be compared.

        Returns
        -------
        relationship : `ConsistentDataIds` or `None`
            Relationship information. This is not `None` and coerces to
            `True` in boolean contexts if and only if the data IDs are
            consistent in terms of all common key-value pairs, all many-to-many
            join tables, and all spatial and temporal relationships.
        """
        a = DataCoordinate.standardize(a, universe=self.dimensions)
        b = DataCoordinate.standardize(b, universe=self.dimensions)
        aFull = getattr(a, "full", None)
        bFull = getattr(b, "full", None)
        aBest = aFull if aFull is not None else a
        bBest = bFull if bFull is not None else b
        jointKeys = aBest.keys() & bBest.keys()
        # If any common values are not equal, we know they are inconsistent.
        if any(aBest[k] != bBest[k] for k in jointKeys):
            return None
        # If the graphs are equal, we know the data IDs are.
        if a.graph == b.graph:
            return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys))
        # Result is still inconclusive. Try to expand a data ID containing
        # keys from both; that will fail if they are inconsistent.
        # First, if either input was already an ExpandedDataCoordinate, extract
        # its records so we don't have to query for them.
        records = {}
        if hasattr(a, "records"):
            records.update(a.records)
        if hasattr(b, "records"):
            records.update(b.records)
        try:
            self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records)
        except InconsistentDataIdError:
            return None
        # We know the answer is not `None`; time to figure out what it is.
        return ConsistentDataIds(
            contains=(a.graph >= b.graph),
            within=(a.graph <= b.graph),
            overlaps=bool(a.graph & b.graph),
        )

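    # Sketch of interpreting the result of the method above (data ID values
    # are illustrative):
    #
    #     rel = registry.relateDataIds({"instrument": "DummyCam", "visit": 42},
    #                                  {"instrument": "DummyCam"})
    #     if rel:              # not None, so the data IDs are not inconsistent
    #         rel.contains     # first data ID's dimensions are a superset here
    #         rel.disjoint     # False: "instrument" is a common key
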

    def insertDimensionData(self, element: Union[DimensionElement, str],
                            *data: Union[dict, DimensionRecord],
                            conform: bool = True):
        """Insert one or more dimension records into the database.

        Parameters
        ----------
        element : `DimensionElement` or `str`
            The `DimensionElement` or name thereof that identifies the table
            records will be inserted into.
        data : `dict` or `DimensionRecord` (variadic)
            One or more records to insert.
        conform : `bool`, optional
            If `False` (`True` is default) perform no checking or conversions,
            and assume that ``element`` is a `DimensionElement` instance and
            ``data`` is one or more `DimensionRecord` instances of the
            appropriate subclass.
        """
        if conform:
            element = self.dimensions[element]  # if this is a name, convert it to a true DimensionElement.
            records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row
                       for row in data]
        else:
            records = data
        storage = self._dimensions[element]
        storage.insert(*records)

1446 def syncDimensionData(self, element: Union[DimensionElement, str], 

1447 row: Union[dict, DimensionRecord], 

1448 conform: bool = True) -> bool: 

1449 """Synchronize the given dimension record with the database, inserting 

1450 if it does not already exist and comparing values if it does. 

1451 

1452 Parameters 

1453 ---------- 

1454 element : `DimensionElement` or `str` 

1455 The `DimensionElement` or name thereof that identifies the table 

1456 records will be inserted into. 

1457 row : `dict` or `DimensionRecord` 

1458 The record to insert. 

1459 conform : `bool`, optional 

1460 If `False` (`True` is default) perform no checking or conversions, 

1461 and assume that ``element`` is a `DimensionElement` instance and 

1462 ``data`` is a one or more `DimensionRecord` instances of the 

1463 appropriate subclass. 

1464 

1465 Returns 

1466 ------- 

1467 inserted : `bool` 

1468 `True` if a new row was inserted, `False` otherwise. 

1469 

1470 Raises 

1471 ------ 

1472 ConflictingDefinitionError 

1473 Raised if the record exists in the database (according to primary 

1474 key lookup) but is inconsistent with the given one. 

1475 

1476 Notes 

1477 ----- 

1478 This method cannot be called within transactions, as it needs to be 

1479 able to perform its own transaction to be concurrent. 

1480 """ 

1481 if conform: 

1482 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1483 record = element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row 

1484 else: 

1485 record = row 

1486 storage = self._dimensions[element] 

1487 try: 

1488 return storage.sync(record) 

1489 except DatabaseConflictError as err: 

1490 raise ConflictingDefinitionError(str(err)) from err 

1491 

1492 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]: 

1493 """Iterate over the dataset types whose names match an expression. 

1494 

1495 Parameters 

1496 ---------- 

1497 expression : `Any`, optional 

1498 An expression that fully or partially identifies the dataset types 

1499 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1500 `...` can be used to return all dataset types, and is the default. 

1501 See :ref:`daf_butler_dataset_type_expressions` for more 

1502 information. 

1503 

1504 Yields 

1505 ------ 

1506 datasetType : `DatasetType` 

1507 A `DatasetType` instance whose name matches ``expression``. 
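 
Examples 
-------- 
An illustrative sketch, assuming a `Registry` instance named 
``registry``; the dataset type names are hypothetical: 
 
>>> import re 
>>> # All registered dataset types. 
>>> everything = list(registry.queryDatasetTypes(...)) 
>>> # Only dataset types whose names start with "calexp". 
>>> calexpTypes = list(registry.queryDatasetTypes(re.compile("calexp.*"))) 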

1508 """ 

1509 yield from self._datasetStorage.fetchDatasetTypes(expression) 

1510 

1511 def queryCollections(self, expression: Any = ..., 

1512 datasetType: Optional[DatasetType] = None, 

1513 collectionType: Optional[CollectionType] = None, 

1514 flattenChains: bool = False, 

1515 includeChains: Optional[bool] = None) -> Iterator[str]: 

1516 """Iterate over the collections whose names match an expression. 

1517 

1518 Parameters 

1519 ---------- 

1520 expression : `Any`, optional 

1521 An expression that fully or partially identifies the collections 

1522 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1523 `...` can be used to return all collections, and is the default. 

1524 See :ref:`daf_butler_collection_expressions` for more 

1525 information. 

1526 datasetType : `DatasetType`, optional 

1527 If provided, only yield collections that should be searched for 

1528 this dataset type according to ``expression``. If this is 

1529 not provided, any dataset type restrictions in ``expression`` are 

1530 ignored. 

1531 collectionType : `CollectionType`, optional 

1532 If provided, only yield collections of this type. 

1533 flattenChains : `bool`, optional 

1534 If `True` (`False` is default), recursively yield the child 

1535 collections of matching `~CollectionType.CHAINED` collections. 

1536 includeChains : `bool`, optional 

1537 If `True`, yield the names of matching `~CollectionType.CHAINED` 

1538 collections. Default is the opposite of ``flattenChains``: include 

1539 either CHAINED collections or their children, but not both. 

1540 

1541 Yields 

1542 ------ 

1543 collection : `str` 

1544 The name of a collection that matches ``expression``. 
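 
Examples 
-------- 
An illustrative sketch, assuming a `Registry` instance named 
``registry``; the collection names and the import path for 
`CollectionType` are assumptions, not guarantees: 
 
>>> import re 
>>> from lsst.daf.butler.registry import CollectionType  # path assumed 
>>> # All collections, of any type. 
>>> everything = list(registry.queryCollections(...)) 
>>> # Only RUN-type collections whose names start with "DummyCam/". 
>>> runs = list(registry.queryCollections(re.compile("DummyCam/.*"), 
...                                       collectionType=CollectionType.RUN)) 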

1545 """ 

1546 query = CollectionQuery.fromExpression(expression) 

1547 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1548 flattenChains=flattenChains, includeChains=includeChains): 

1549 yield record.name 

1550 

1551 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1552 """Return a `QueryBuilder` instance capable of constructing and 

1553 managing more complex queries than those obtainable via `Registry` 

1554 interfaces. 

1555 

1556 This is an advanced interface; downstream code should prefer 

1557 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1558 are sufficient. 

1559 

1560 Parameters 

1561 ---------- 

1562 summary : `QuerySummary` 

1563 Object describing and categorizing the full set of dimensions that 

1564 will be included in the query. 

1565 

1566 Returns 

1567 ------- 

1568 builder : `QueryBuilder` 

1569 Object that can be used to construct and perform advanced queries. 
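 
Examples 
-------- 
A rough sketch of direct `QueryBuilder` use, mirroring what 
`queryDimensions` does internally; the dimension names, data ID, and 
the import path for `QuerySummary` are assumptions: 
 
>>> from lsst.daf.butler.registry.queries import QuerySummary  # path assumed 
>>> summary = QuerySummary( 
...     requested=registry.dimensions.extract(["visit", "detector"]), 
...     dataId=registry.expandDataId(instrument="DummyCam"),  # hypothetical 
... ) 
>>> builder = registry.makeQueryBuilder(summary) 
>>> query = builder.finish() 
>>> predicate = query.predicate() 
>>> dataIds = [query.extractDataId(row) for row in query.execute() 
...            if predicate(row)] 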

1570 """ 

1571 return QueryBuilder(connection=self._connection, summary=summary, 

1572 dimensionStorage=self._dimensions, 

1573 datasetStorage=self._datasetStorage) 

1574 

1575 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1576 dataId: Optional[DataId] = None, 

1577 datasets: Any = None, 

1578 collections: Any = None, 

1579 where: Optional[str] = None, 

1580 expand: bool = True, 

1581 **kwds) -> Iterator[DataCoordinate]: 

1582 """Query for and iterate over data IDs matching user-provided criteria. 

1583 

1584 Parameters 

1585 ---------- 

1586 dimensions : `Dimension` or `str`, or iterable thereof 

1587 The dimensions of the data IDs to yield, as either `Dimension` 

1588 instances or `str`. Will be automatically expanded to a complete 

1589 `DimensionGraph`. 

1590 dataId : `dict` or `DataCoordinate`, optional 

1591 A data ID whose key-value pairs are used as equality constraints 

1592 in the query. 

1593 datasets : `Any`, optional 

1594 An expression that fully or partially identifies dataset types 

1595 that should constrain the yielded data IDs. For example, including 

1596 "raw" here would constrain the yielded ``instrument``, 

1597 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1598 those for which at least one "raw" dataset exists in 

1599 ``collections``. Allowed types include `DatasetType`, `str`, 

1600 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1601 expressions, `...` is not permitted - it doesn't make sense to 

1602 constrain data IDs on the existence of *all* datasets. 

1603 See :ref:`daf_butler_dataset_type_expressions` for more 

1604 information. 

1605 collections : `Any`, optional 

1606 An expression that fully or partially identifies the collections 

1607 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1608 thereof. `...` can be used to return all collections. Must be 

1609 provided if ``datasets`` is, and is ignored if it is not. See 

1610 :ref:`daf_butler_collection_expressions` for more information. 

1611 where : `str`, optional 

1612 A string expression similar to a SQL WHERE clause. May involve 

1613 any column of a dimension table or (as a shortcut for the primary 

1614 key column of a dimension table) dimension name. See 

1615 :ref:`daf_butler_dimension_expressions` for more information. 

1616 expand : `bool`, optional 

1617 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1618 minimal `DataCoordinate` base-class instances. 

1619 kwds 

1620 Additional keyword arguments are forwarded to 

1621 `DataCoordinate.standardize` when processing the ``dataId`` 

1622 argument (and may be used to provide a constraining data ID even 

1623 when the ``dataId`` argument is `None`). 

1624 

1625 Yields 

1626 ------ 

1627 dataId : `DataCoordinate` 

1628 Data IDs matching the given query parameters. Order is 

1629 unspecified. 
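 
Examples 
-------- 
An illustrative sketch, assuming a `Registry` instance named 
``registry``; the dataset type, collection name, and ``where`` 
expression are hypothetical: 
 
>>> dataIds = list(registry.queryDimensions( 
...     ["exposure", "detector"], 
...     datasets="raw", 
...     collections="DummyCam/raw/all",  # hypothetical collection 
...     where="instrument = 'DummyCam' AND detector = 2", 
... )) 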

1630 """ 

1631 dimensions = iterable(dimensions) 

1632 standardizedDataId = self.expandDataId(dataId, **kwds) 

1633 standardizedDatasetTypes = [] 

1634 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1635 if datasets is not None: 

1636 if collections is None: 

1637 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1638 for datasetType in self._datasetStorage.fetchDatasetTypes(datasets): 

1639 requestedDimensionNames.update(datasetType.dimensions.names) 

1640 standardizedDatasetTypes.append(datasetType) 

1641 # Preprocess collections expression in case the original included 

1642 # single-pass iterators (we'll want to use it multiple times 

1643 # below). 

1644 collections = CollectionQuery.fromExpression(collections) 

1645 

1646 summary = QuerySummary( 

1647 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1648 dataId=standardizedDataId, 

1649 expression=where, 

1650 ) 

1651 builder = self.makeQueryBuilder(summary) 

1652 for datasetType in standardizedDatasetTypes: 

1653 builder.joinDataset(datasetType, collections, isResult=False) 

1654 query = builder.finish() 

1655 predicate = query.predicate() 

1656 for row in query.execute(): 

1657 if predicate(row): 

1658 result = query.extractDataId(row) 

1659 if expand: 

1660 yield self.expandDataId(result, records=standardizedDataId.records) 

1661 else: 

1662 yield result 

1663 

1664 def queryDatasets(self, datasetType: Any, *, 

1665 collections: Any, 

1666 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1667 dataId: Optional[DataId] = None, 

1668 where: Optional[str] = None, 

1669 deduplicate: bool = False, 

1670 expand: bool = True, 

1671 **kwds) -> Iterator[DatasetRef]: 

1672 """Query for and iterate over dataset references matching user-provided 

1673 criteria. 

1674 

1675 Parameters 

1676 ---------- 

1677 datasetType : `Any` 

1678 An expression that fully or partially identifies the dataset types 

1679 to be queried. Allowed types include `DatasetType`, `str`, 

1680 `re.Pattern`, and iterables thereof. The special value `...` can 

1681 be used to query all dataset types. See 

1682 :ref:`daf_butler_dataset_type_expressions` for more information. 

1683 collections : `Any` 

1684 An expression that fully or partially identifies the collections 

1685 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1686 thereof. `...` can be used to return all collections. See 

1687 :ref:`daf_butler_collection_expressions` for more information. 

1688 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1689 Dimensions to include in the query (in addition to those used 

1690 to identify the queried dataset type(s)), either to constrain 

1691 the resulting datasets to those for which a matching dimension 

1692 exists, or to relate the dataset type's dimensions to dimensions 

1693 referenced by the ``dataId`` or ``where`` arguments. 

1694 dataId : `dict` or `DataCoordinate`, optional 

1695 A data ID whose key-value pairs are used as equality constraints 

1696 in the query. 

1697 where : `str`, optional 

1698 A string expression similar to a SQL WHERE clause. May involve 

1699 any column of a dimension table or (as a shortcut for the primary 

1700 key column of a dimension table) dimension name. See 

1701 :ref:`daf_butler_dimension_expressions` for more information. 

1702 deduplicate : `bool`, optional 

1703 If `True` (`False` is the default), for each result data ID, only 

1704 yield one `DatasetRef` of each `DatasetType`, from the first 

1705 collection in which a dataset of that dataset type appears 

1706 (according to the order of ``collections`` passed in). If `True`, 

1707 ``collections`` must not contain regular expressions and may not 

1708 be `...`. 

1709 expand : `bool`, optional 

1710 If `True` (default) attach `ExpandedDataCoordinate` instead of 

1711 minimal `DataCoordinate` base-class instances. 

1712 kwds 

1713 Additional keyword arguments are forwarded to 

1714 `DataCoordinate.standardize` when processing the ``dataId`` 

1715 argument (and may be used to provide a constraining data ID even 

1716 when the ``dataId`` argument is `None`). 

1717 

1718 Yields 

1719 ------ 

1720 ref : `DatasetRef` 

1721 Dataset references matching the given query criteria. These 

1722 are grouped by `DatasetType` if the query evaluates to multiple 

1723 dataset types, but order is otherwise unspecified. 

1724 

1725 Raises 

1726 ------ 

1727 TypeError 

1728 Raised when the arguments are incompatible, such as when a 

1729 collection wildcard is passed when ``deduplicate`` is `True`. 

1730 

1731 Notes 

1732 ----- 

1733 When multiple dataset types are queried in a single call, the 

1734 results of this operation are equivalent to querying for each dataset 

1735 type separately in turn, and no information about the relationships 

1736 between datasets of different types is included. In contexts where 

1737 that kind of information is important, the recommended pattern is to 

1738 use `queryDimensions` to first obtain data IDs (possibly with the 

1739 desired dataset types and collections passed as constraints to the 

1740 query), and then use multiple (generally much simpler) calls to 

1741 `queryDatasets` with the returned data IDs passed as constraints. 
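 
Examples 
-------- 
An illustrative sketch, assuming a `Registry` instance named 
``registry``; the dataset type name, collection names, and ``where`` 
expression are hypothetical: 
 
>>> refs = list(registry.queryDatasets( 
...     "calexp", 
...     collections=["DummyCam/run2", "DummyCam/run1"],  # search order 
...     where="visit = 42 AND instrument = 'DummyCam'", 
...     deduplicate=True,  # one ref per data ID, from the first match 
... )) 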

1742 """ 

1743 # Standardize and expand the data ID provided as a constraint. 

1744 standardizedDataId = self.expandDataId(dataId, **kwds) 

1745 # If the datasetType passed isn't actually a DatasetType, expand it 

1746 # (it could be an expression that yields multiple DatasetTypes) and 

1747 # recurse. 

1748 if not isinstance(datasetType, DatasetType): 

1749 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType): 

1750 yield from self.queryDatasets(trueDatasetType, collections=collections, 

1751 dimensions=dimensions, dataId=standardizedDataId, 

1752 where=where, deduplicate=deduplicate, expand=expand) 

1753 return 

1754 # The full set of dimensions in the query is the combination of those 

1755 # needed for the DatasetType and those explicitly requested, if any. 

1756 requestedDimensionNames = set(datasetType.dimensions.names) 

1757 if dimensions is not None: 

1758 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1759 # Construct the summary structure needed to construct a QueryBuilder. 

1760 summary = QuerySummary( 

1761 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1762 dataId=standardizedDataId, 

1763 expression=where, 

1764 ) 

1765 builder = self.makeQueryBuilder(summary) 

1766 # Add the dataset subquery to the query, telling the QueryBuilder to 

1767 # include the rank of the selected collection in the results only if we 

1768 # need to deduplicate. Note that if any of the collections are 

1769 # actually wildcard expressions, and we've asked for deduplication, 

1770 # this will raise TypeError for us. 

1771 builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate) 

1772 query = builder.finish() 

1773 predicate = query.predicate() 

1774 if not deduplicate or len(collections) == 1: 

1775 # No need to de-duplicate across collections. 

1776 for row in query.execute(): 

1777 if predicate(row): 

1778 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1779 if expand: 

1780 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1781 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1782 else: 

1783 # For each data ID, yield only the DatasetRef with the lowest 

1784 # collection rank. 

1785 bestRefs = {} 

1786 bestRanks = {} 

1787 for row in query.execute(): 

1788 if predicate(row): 

1789 ref, rank = query.extractDatasetRef(row, datasetType) 

1790 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1791 if rank < bestRank: 

1792 bestRefs[ref.dataId] = ref 

1793 bestRanks[ref.dataId] = rank 

1794 # If caller requested expanded data IDs, we defer that until here 

1795 # so we do as little expansion as possible. 

1796 if expand: 

1797 for ref in bestRefs.values(): 

1798 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1799 yield ref.expanded(dataId) 

1800 else: 

1801 yield from bestRefs.values() 

1802 

1803 dimensions: DimensionUniverse 

1804 """The universe of all dimensions known to the registry 

1805 (`DimensionUniverse`). 

1806 """ 

1807 

1808 storageClasses: StorageClassFactory 

1809 """All storage classes known to the registry (`StorageClassFactory`). 

1810 """