
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "AmbiguousDatasetError", 

26 "ConflictingDefinitionError", 

27 "ConsistentDataIds", 

28 "InconsistentDataIdError", 

29 "OrphanedRecordError", 

30 "Registry", 

31) 

32 

33import contextlib 

34from dataclasses import dataclass 

35import sys 

36from typing import ( 

37 Any, 

38 Iterable, 

39 Iterator, 

40 List, 

41 Mapping, 

42 Optional, 

43 Set, 

44 Type, 

45 TYPE_CHECKING, 

46 Union, 

47) 

48 

49import sqlalchemy 

50 

51import lsst.sphgeom 

52from ..core import ( 

53 Config, 

54 DataCoordinate, 

55 DataId, 

56 DatasetRef, 

57 DatasetType, 

58 Dimension, 

59 DimensionElement, 

60 DimensionGraph, 

61 DimensionRecord, 

62 DimensionUniverse, 

63 ExpandedDataCoordinate, 

64 FakeDatasetRef, 

65 StorageClassFactory, 

66) 

67from ..core import ddl 

68from ..core.utils import doImport, iterable, transactional 

69from ._config import RegistryConfig 

70from .queries import ( 

71 DatasetRegistryStorage, 

72 QueryBuilder, 

73 QuerySummary, 

74) 

75from .tables import makeRegistryTableSpecs 

76from ._collectionType import CollectionType 

77from .wildcards import CollectionQuery, CollectionSearch 

78from .interfaces import DatabaseConflictError 

79 

80if TYPE_CHECKING:

81 from ..butlerConfig import ButlerConfig 

82 from ..core import ( 

83 Quantum 

84 ) 

85 from .interfaces import ( 

86 CollectionManager, 

87 Database, 

88 OpaqueTableStorageManager, 

89 DimensionRecordStorageManager, 

90 ) 

91 

92 

93@dataclass 

94class ConsistentDataIds: 

95 """A struct used to report relationships between data IDs by 

96 `Registry.relateDataIds`. 

97 

98 If an instance of this class is returned (instead of `None`), the data IDs 

99 are "not inconsistent" - any keys they have in common have the same value, 

100 and any spatial or temporal relationships they have allow for at least the possibility of

101 an overlap. To capture this, any instance of `ConsistentDataIds` coerces 

102 to `True` in boolean contexts. 

103 """ 

104 

105 overlaps: bool 

106 """If `True`, the data IDs have at least one key in common, associated with 

107 the same value. 

108 

109 Note that data IDs are not inconsistent even if overlaps is `False` - they 

110 may simply have no keys in common, which means they cannot have 

111 inconsistent values for any keys. They may even be equal, in the case that 

112 both data IDs are empty. 

113 

114 This field does _not_ indicate whether a spatial or temporal overlap 

115 relationship exists. 

116 """ 

117 

118 contains: bool 

119 """If `True`, all keys in the first data ID are in the second, and are 

120 associated with the same values. 

121 

122 This includes case where the first data ID is empty. 

123 """ 

124 

125 within: bool 

126 """If `True`, all keys in the second data ID are in the first, and are 

127 associated with the same values. 

128 

129 This includes case where the second data ID is empty. 

130 """ 

131 

132 @property 

133 def equal(self) -> bool: 

134 """If `True`, the two data IDs are the same. 

135 

136 Data IDs are equal if they have both a `contains` and a `within` 

137 relationship. 

138 """ 

139 return self.contains and self.within 

140 

141 @property 

142 def disjoint(self) -> bool: 

143 """If `True`, the two data IDs have no keys in common. 

144 

145 This is simply the opposite of `overlaps`. Disjoint data IDs are by

146 definition not inconsistent.

147 """ 

148 return not self.overlaps 

149 

150 def __bool__(self) -> bool: 

151 return True 

152 

153 

154class InconsistentDataIdError(ValueError): 

155 """Exception raised when a data ID contains contradictory key-value pairs, 

156 according to dimension relationships. 

157 

158 This can include the case where the data ID identifies multiple spatial

159 regions or timespans that are disjoint.

160 """ 

161 

162 

163class AmbiguousDatasetError(Exception): 

164 """Exception raised when a `DatasetRef` has no ID and a `Registry` 

165 operation requires one. 

166 """ 

167 

168 

169class ConflictingDefinitionError(Exception): 

170 """Exception raised when trying to insert a database record when a 

171 conflicting record already exists. 

172 """ 

173 

174 

175class OrphanedRecordError(Exception): 

176 """Exception raised when trying to remove or modify a database record 

177 that is still being used in some other table. 

178 """ 

179 

180 

181def _checkAndGetId(ref: DatasetRef) -> int: 

182 """Return the ID of the given `DatasetRef`, or raise if it is `None`. 

183 

184 This trivial function exists to allow operations that would otherwise be 

185 natural list comprehensions to check that the ID is not `None` as well. 

186 

187 Parameters 

188 ---------- 

189 ref : `DatasetRef` 

190 Dataset reference. 

191 

192 Returns 

193 ------- 

194 id : `int` 

195 ``ref.id`` 

196 

197 Raises 

198 ------ 

199 AmbiguousDatasetError 

200 Raised if ``ref.id`` is `None`. 

201 """ 

202 if ref.id is None: 

203 raise AmbiguousDatasetError("Dataset ID must not be `None`.") 

204 return ref.id 

205 

206 

207class Registry: 

208 """Registry interface. 

209 

210 Parameters 

211 ---------- 

212 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

213 Registry configuration 

214 """ 

215 

216 defaultConfigFile = None 

217 """Path to configuration defaults. Relative to $DAF_BUTLER_DIR/config or 

218 absolute path. Can be None if no defaults specified. 

219 """ 

220 

221 @classmethod 

222 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], create: bool = False, 

223 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

224 """Create `Registry` subclass instance from `config`. 

225 

226 Uses ``registry.cls`` from `config` to determine which subclass to 

227 instantiate. 

228 

229 Parameters 

230 ---------- 

231 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

232 Registry configuration 

233 create : `bool`, optional 

234 Assume empty Registry and create a new one. 

235 butlerRoot : `str`, optional 

236 Path to the repository root this `Registry` will manage. 

237 writeable : `bool`, optional 

238 If `True` (default) create a read-write connection to the database. 

239 

240 Returns 

241 ------- 

242 registry : `Registry` (subclass) 

243 A new `Registry` subclass instance. 

244 """ 

245 if not isinstance(config, RegistryConfig): 

246 if isinstance(config, str) or isinstance(config, Config): 

247 config = RegistryConfig(config) 

248 else: 

249 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

250 config.replaceRoot(butlerRoot) 

251 DatabaseClass = config.getDatabaseClass() 

252 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

253 namespace=config.get("namespace"), writeable=writeable) 

254 universe = DimensionUniverse(config) 

255 opaque = doImport(config["managers", "opaque"]) 

256 dimensions = doImport(config["managers", "dimensions"]) 

257 collections = doImport(config["managers", "collections"]) 

258 return cls(database, universe, dimensions=dimensions, opaque=opaque, collections=collections, 

259 create=create) 

260 
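# Hypothetical usage sketch for `Registry.fromConfig` above (illustrative only,
# not part of the original source); the configuration path "registry.yaml" and
# the database it describes are assumptions:
#
#     config = RegistryConfig("registry.yaml")
#     registry = Registry.fromConfig(config, create=True, writeable=True)
#     assert registry.isWriteable()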

261 def __init__(self, database: Database, universe: DimensionUniverse, *, 

262 opaque: Type[OpaqueTableStorageManager], 

263 dimensions: Type[DimensionRecordStorageManager], 

264 collections: Type[CollectionManager], 

265 create: bool = False): 

266 self._db = database 

267 self.storageClasses = StorageClassFactory() 

268 with self._db.declareStaticTables(create=create) as context: 

269 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

270 self._collections = collections.initialize(self._db, context) 

271 self._tables = context.addTableTuple(makeRegistryTableSpecs(self.dimensions, self._collections)) 

272 self._opaque = opaque.initialize(self._db, context) 

273 self._collections.refresh() 

274 # TODO: we shouldn't be grabbing the private connection from the 

275 # Database instance like this, but it's a reasonable way to proceed 

276 # while we transition to using the Database API more. 

277 self._connection = self._db._connection 

278 self._datasetStorage = DatasetRegistryStorage(connection=self._connection, 

279 universe=self.dimensions, 

280 tables=self._tables._asdict(), 

281 collections=self._collections) 

282 self._datasetTypes = {} 

283 

284 def __str__(self) -> str: 

285 return str(self._db) 

286 

287 def __repr__(self) -> str: 

288 return f"Registry({self._db!r}, {self.dimensions!r})" 

289 

290 def isWriteable(self) -> bool: 

291 """Return `True` if this registry allows write operations, and `False` 

292 otherwise. 

293 """ 

294 return self._db.isWriteable() 

295 

296 @property 

297 def dimensions(self) -> DimensionUniverse: 

298 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

299 """ 

300 return self._dimensions.universe 

301 

302 @contextlib.contextmanager 

303 def transaction(self): 

304 """Return a context manager that represents a transaction. 

305 """ 

306 # TODO make savepoint=False the default. 

307 try: 

308 with self._db.transaction(): 

309 yield 

310 except BaseException: 

311 # TODO: this clears the caches sometimes when we wouldn't actually 

312 # need to. Can we avoid that? 

313 self._dimensions.clearCaches() 

314 self._datasetTypes.clear() 

315 raise 

316 

317 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec): 

318 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

319 other data repository client. 

320 

321 Opaque table records can be added via `insertOpaqueData`, retrieved via 

322 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

323 

324 Parameters 

325 ---------- 

326 tableName : `str` 

327 Logical name of the opaque table. This may differ from the 

328 actual name used in the database by a prefix and/or suffix. 

329 spec : `ddl.TableSpec` 

330 Specification for the table to be added. 

331 """ 

332 self._opaque.register(tableName, spec) 

333 

334 @transactional 

335 def insertOpaqueData(self, tableName: str, *data: dict): 

336 """Insert records into an opaque table. 

337 

338 Parameters 

339 ---------- 

340 tableName : `str` 

341 Logical name of the opaque table. Must match the name used in a 

342 previous call to `registerOpaqueTable`. 

343 data 

344 Each additional positional argument is a dictionary that represents 

345 a single row to be added. 

346 """ 

347 self._opaque[tableName].insert(*data) 

348 

349 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

350 """Retrieve records from an opaque table. 

351 

352 Parameters 

353 ---------- 

354 tableName : `str` 

355 Logical name of the opaque table. Must match the name used in a 

356 previous call to `registerOpaqueTable`. 

357 where 

358 Additional keyword arguments are interpreted as equality 

359 constraints that restrict the returned rows (combined with AND); 

360 keyword arguments are column names and values are the values they 

361 must have. 

362 

363 Yields 

364 ------ 

365 row : `dict` 

366 A dictionary representing a single result row. 

367 """ 

368 yield from self._opaque[tableName].fetch(**where) 

369 

370 @transactional 

371 def deleteOpaqueData(self, tableName: str, **where: Any): 

372 """Remove records from an opaque table. 

373 

374 Parameters 

375 ---------- 

376 tableName : `str` 

377 Logical name of the opaque table. Must match the name used in a 

378 previous call to `registerOpaqueTable`. 

379 where 

380 Additional keyword arguments are interpreted as equality 

381 constraints that restrict the deleted rows (combined with AND); 

382 keyword arguments are column names and values are the values they 

383 must have. 

384 """ 

385 self._opaque[tableName].delete(**where) 

386 
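# Hypothetical sketch of the opaque-table round trip provided by
# `registerOpaqueTable`, `insertOpaqueData`, `fetchOpaqueData`, and
# `deleteOpaqueData`. The table name and the `ddl.TableSpec`/`ddl.FieldSpec`
# construction shown here are assumptions, not taken from this module:
#
#     spec = ddl.TableSpec(fields=[
#         ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
#         ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
#     ])
#     registry.registerOpaqueTable("my_datastore_records", spec)
#     registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a.fits"})
#     rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))
#     registry.deleteOpaqueData("my_datastore_records", dataset_id=1)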

387 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED): 

388 """Add a new collection if one with the given name does not exist. 

389 

390 Parameters 

391 ---------- 

392 name : `str` 

393 The name of the collection to create. 

394 type : `CollectionType` 

395 Enum value indicating the type of collection to create. 

396 

397 Notes 

398 ----- 

399 This method cannot be called within transactions, as it needs to be 

400 able to perform its own transaction to be concurrent. 

401 """ 

402 self._collections.register(name, type) 

403 

404 def getCollectionType(self, name: str) -> CollectionType: 

405 """Return an enumeration value indicating the type of the given 

406 collection. 

407 

408 Parameters 

409 ---------- 

410 name : `str` 

411 The name of the collection. 

412 

413 Returns 

414 ------- 

415 type : `CollectionType` 

416 Enum value indicating the type of this collection. 

417 

418 Raises 

419 ------ 

420 MissingCollectionError 

421 Raised if no collection with the given name exists. 

422 """ 

423 return self._collections.find(name).type 

424 

425 def registerRun(self, name: str): 

426 """Add a new run if one with the given name does not exist. 

427 

428 Parameters 

429 ---------- 

430 name : `str` 

431 The name of the run to create. 

432 

433 Notes 

434 ----- 

435 This method cannot be called within transactions, as it needs to be 

436 able to perform its own transaction to be concurrent. 

437 """ 

438 self._collections.register(name, CollectionType.RUN) 

439 

440 @transactional 

441 def removeCollection(self, name: str): 

442 """Completely remove the given collection. 

443 

444 Parameters 

445 ---------- 

446 name : `str` 

447 The name of the collection to remove. 

448 

449 Raises 

450 ------ 

451 MissingCollectionError 

452 Raised if no collection with the given name exists. 

453 

454 Notes 

455 ----- 

456 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

457 in it are also fully removed. This requires that those datasets be 

458 removed (or at least trashed) from any datastores that hold them first. 

459 

460 A collection may not be deleted as long as it is referenced by a 

461 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

462 be deleted or redefined first. 

463 """ 

464 self._collections.remove(name) 

465 

466 def getCollectionChain(self, parent: str) -> CollectionSearch: 

467 """Return the child collections in a `~CollectionType.CHAINED` 

468 collection. 

469 

470 Parameters 

471 ---------- 

472 parent : `str` 

473 Name of the chained collection. Must have already been added via 

474 a call to `Registry.registerCollection`. 

475 

476 Returns 

477 ------- 

478 children : `CollectionSearch` 

479 An object that defines the search path of the collection. 

480 See :ref:`daf_butler_collection_expressions` for more information. 

481 

482 Raises 

483 ------ 

484 MissingCollectionError 

485 Raised if ``parent`` does not exist in the `Registry`. 

486 TypeError 

487 Raised if ``parent`` does not correspond to a 

488 `~CollectionType.CHAINED` collection. 

489 """ 

490 record = self._collections.find(parent) 

491 if record.type is not CollectionType.CHAINED: 

492 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

493 return record.children 

494 

495 @transactional 

496 def setCollectionChain(self, parent: str, children: Any): 

497 """Define or redefine a `~CollectionType.CHAINED` collection. 

498 

499 Parameters 

500 ---------- 

501 parent : `str` 

502 Name of the chained collection. Must have already been added via 

503 a call to `Registry.registerCollection`. 

504 children : `Any` 

505 An expression defining an ordered search of child collections, 

506 generally an iterable of `str`. Restrictions on the dataset types 

507 to be searched can also be included, by passing mapping or an 

508 iterable containing tuples; see 

509 :ref:`daf_butler_collection_expressions` for more information. 

510 

511 Raises 

512 ------ 

513 MissingCollectionError 

514 Raised when any of the given collections do not exist in the 

515 `Registry`. 

516 TypeError 

517 Raised if ``parent`` does not correspond to a 

518 `~CollectionType.CHAINED` collection. 

519 ValueError 

520 Raised if the given collections contain a cycle.

521 """ 

522 record = self._collections.find(parent) 

523 if record.type is not CollectionType.CHAINED: 

524 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

525 children = CollectionSearch.fromExpression(children) 

526 if children != record.children: 

527 record.update(self._collections, children) 

528 
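# Hypothetical sketch of building a CHAINED collection from the collection
# methods above; all collection names are illustrative:
#
#     registry.registerRun("run/a")
#     registry.registerRun("run/b")
#     registry.registerCollection("release/v1", type=CollectionType.CHAINED)
#     registry.setCollectionChain("release/v1", ["run/a", "run/b"])
#     children = registry.getCollectionChain("release/v1")  # ordered search path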

529 @transactional 

530 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

531 """ 

532 Add a new `DatasetType` to the Registry. 

533 

534 It is not an error to register the same `DatasetType` twice. 

535 

536 Parameters 

537 ---------- 

538 datasetType : `DatasetType` 

539 The `DatasetType` to be added. 

540 

541 Returns 

542 ------- 

543 inserted : `bool` 

544 `True` if ``datasetType`` was inserted, `False` if an identical 

545 existing `DatasetType` was found. Note that in either case the

546 DatasetType is guaranteed to be defined in the Registry 

547 consistently with the given definition. 

548 

549 Raises 

550 ------ 

551 ValueError 

552 Raised if the dimensions or storage class are invalid. 

553 ConflictingDefinitionError 

554 Raised if this DatasetType is already registered with a different 

555 definition. 

556 """ 

557 # TODO: this implementation isn't concurrent, except *maybe* in SQLite 

558 # with aggressive locking (where starting a transaction is essentially 

559 # the same as grabbing a full-database lock). Should be reimplemented 

560 # with Database.sync to fix this, but that may require schema changes 

561 # as well so we only have to synchronize one row to know if we have 

562 # inconsistent definitions. 

563 

564 # If the DatasetType is already in the cache, we assume it's already in 

565 # the DB (note that we don't actually provide a way to remove them from 

566 # the DB). 

567 existingDatasetType = self._datasetTypes.get(datasetType.name) 

568 # If it's not in the cache, try to insert it. 

569 if existingDatasetType is None: 

570 try: 

571 with self._db.transaction(): 

572 self._db.insert( 

573 self._tables.dataset_type, 

574 { 

575 "dataset_type_name": datasetType.name, 

576 "storage_class": datasetType.storageClass.name, 

577 } 

578 ) 

579 except sqlalchemy.exc.IntegrityError: 

580 # Insert failed on the only unique constraint on this table: 

581 # dataset_type_name. So now the question is whether the one in 

582 # there is the same as the one we tried to insert. 

583 existingDatasetType = self.getDatasetType(datasetType.name) 

584 else: 

585 # If adding the DatasetType record itself succeeded, add its 

586 # dimensions (if any). We don't guard this in a try block 

587 # because a problem with this insert means the database 

588 # content must be corrupted. 

589 if datasetType.dimensions: 

590 self._db.insert( 

591 self._tables.dataset_type_dimensions, 

592 *[{"dataset_type_name": datasetType.name, 

593 "dimension_name": dimensionName} 

594 for dimensionName in datasetType.dimensions.names] 

595 ) 

596 # Update the cache. 

597 self._datasetTypes[datasetType.name] = datasetType 

598 # Also register component DatasetTypes (if any). 

599 for compName, compStorageClass in datasetType.storageClass.components.items(): 

600 compType = DatasetType(datasetType.componentTypeName(compName), 

601 dimensions=datasetType.dimensions, 

602 storageClass=compStorageClass) 

603 self.registerDatasetType(compType) 

604 # Inserts succeeded, nothing left to do here. 

605 return True 

606 # A DatasetType with this name exists; check whether it is equal.

607 if datasetType == existingDatasetType: 

608 return False 

609 else: 

610 raise ConflictingDefinitionError(f"DatasetType: {datasetType} != existing {existingDatasetType}") 

611 
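# Hypothetical sketch of registering a dataset type. The name "calexp", the
# dimension names, the "ExposureF" storage class, and the
# `DimensionUniverse.extract` call are assumptions; they must already be known
# to the registry's dimension universe and storage class factory:
#
#     datasetType = DatasetType(
#         "calexp",
#         dimensions=registry.dimensions.extract(["instrument", "visit", "detector"]),
#         storageClass="ExposureF",
#     )
#     inserted = registry.registerDatasetType(datasetType)  # False if already defined identically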

612 def getDatasetType(self, name: str) -> DatasetType: 

613 """Get the `DatasetType`. 

614 

615 Parameters 

616 ---------- 

617 name : `str` 

618 Name of the type. 

619 

620 Returns 

621 ------- 

622 type : `DatasetType` 

623 The `DatasetType` associated with the given name. 

624 

625 Raises 

626 ------ 

627 KeyError 

628 Raised if the requested DatasetType could not be found in the registry.

629 """ 

630 datasetType = self._datasetTypes.get(name) 

631 if datasetType is None: 

632 # Get StorageClass from DatasetType table 

633 result = self._db.query( 

634 sqlalchemy.sql.select( 

635 [self._tables.dataset_type.c.storage_class] 

636 ).where( 

637 self._tables.dataset_type.columns.dataset_type_name == name 

638 ) 

639 ).fetchone() 

640 

641 if result is None: 

642 raise KeyError("Could not find entry for datasetType {}".format(name)) 

643 

644 storageClass = self.storageClasses.getStorageClass(result["storage_class"]) 

645 # Get Dimensions (if any) from DatasetTypeDimensions table 

646 result = self._db.query( 

647 sqlalchemy.sql.select( 

648 [self._tables.dataset_type_dimensions.columns.dimension_name] 

649 ).where( 

650 self._tables.dataset_type_dimensions.columns.dataset_type_name == name 

651 ) 

652 ).fetchall() 

653 dimensions = DimensionGraph(self.dimensions, names=(r[0] for r in result) if result else ()) 

654 datasetType = DatasetType(name=name, 

655 storageClass=storageClass, 

656 dimensions=dimensions) 

657 self._datasetTypes[name] = datasetType 

658 return datasetType 

659 

660 def _makeDatasetRefFromRow(self, row: sqlalchemy.engine.RowProxy, 

661 datasetType: Optional[DatasetType] = None, 

662 dataId: Optional[DataCoordinate] = None): 

663 """Construct a DatasetRef from the result of a query on the Dataset 

664 table. 

665 

666 Parameters 

667 ---------- 

668 row : `sqlalchemy.engine.RowProxy`. 

669 Row of a query that contains all columns from the `Dataset` table. 

670 May include additional fields (which will be ignored). 

671 datasetType : `DatasetType`, optional 

672 `DatasetType` associated with this dataset. Will be retrieved 

673 if not provided. If provided, the caller guarantees that it is 

674 already consistent with what would have been retrieved from the 

675 database. 

676 dataId : `DataCoordinate`, optional 

677 Dimensions associated with this dataset. Will be retrieved if not 

678 provided. If provided, the caller guarantees that it is already 

679 consistent with what would have been retrieved from the database. 

680 

681 Returns 

682 ------- 

683 ref : `DatasetRef`. 

684 A new `DatasetRef` instance. 

685 """ 

686 if datasetType is None: 

687 datasetType = self.getDatasetType(row["dataset_type_name"]) 

688 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

689 assert runRecord is not None, "Should be guaranteed by foreign key constraints." 

690 run = runRecord.name 

691 datasetRefHash = row["dataset_ref_hash"] 

692 if dataId is None: 

693 # TODO: should we expand here? 

694 dataId = DataCoordinate.standardize( 

695 row, 

696 graph=datasetType.dimensions, 

697 universe=self.dimensions 

698 ) 

699 # Get components (if present) 

700 components = {} 

701 if datasetType.storageClass.isComposite(): 

702 t = self._tables 

703 columns = list(t.dataset.columns) 

704 columns.append(t.dataset_composition.columns.component_name) 

705 results = self._db.query( 

706 sqlalchemy.sql.select( 

707 columns 

708 ).select_from( 

709 t.dataset.join( 

710 t.dataset_composition, 

711 (t.dataset.columns.dataset_id == t.dataset_composition.columns.component_dataset_id) 

712 ) 

713 ).where( 

714 t.dataset_composition.columns.parent_dataset_id == row["dataset_id"] 

715 ) 

716 ).fetchall() 

717 for result in results: 

718 componentName = result["component_name"] 

719 componentDatasetType = DatasetType( 

720 DatasetType.nameWithComponent(datasetType.name, componentName), 

721 dimensions=datasetType.dimensions, 

722 storageClass=datasetType.storageClass.components[componentName] 

723 ) 

724 components[componentName] = self._makeDatasetRefFromRow(result, dataId=dataId, 

725 datasetType=componentDatasetType) 

726 if not components.keys() <= datasetType.storageClass.components.keys(): 

727 raise RuntimeError( 

728 f"Inconsistency detected between dataset and storage class definitions: " 

729 f"{datasetType.storageClass.name} has components " 

730 f"{set(datasetType.storageClass.components.keys())}, " 

731 f"but dataset has components {set(components.keys())}" 

732 ) 

733 return DatasetRef(datasetType=datasetType, dataId=dataId, id=row["dataset_id"], run=run, 

734 hash=datasetRefHash, components=components) 

735 

736 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

737 collections: Any, **kwds: Any) -> Optional[DatasetRef]: 

738 """Find a dataset given its `DatasetType` and data ID. 

739 

740 This can be used to obtain a `DatasetRef` that permits the dataset to 

741 be read from a `Datastore`. 

742 

743 Parameters 

744 ---------- 

745 datasetType : `DatasetType` or `str` 

746 A `DatasetType` or the name of one. 

747 dataId : `dict` or `DataCoordinate`, optional 

748 A `dict`-like object containing the `Dimension` links that identify 

749 the dataset within a collection. 

750 collections 

751 An expression that fully or partially identifies the collections 

752 to search for the dataset, such as a `str`, `re.Pattern`, or 

753 iterable thereof. `...` can be used to return all collections. 

754 See :ref:`daf_butler_collection_expressions` for more information. 

755 **kwds 

756 Additional keyword arguments passed to 

757 `DataCoordinate.standardize` to convert ``dataId`` to a true 

758 `DataCoordinate` or augment an existing one. 

759 

760 Returns 

761 ------- 

762 ref : `DatasetRef` 

763 A reference to the dataset, or `None` if no matching Dataset 

764 was found. 

765 

766 Raises 

767 ------ 

768 LookupError 

769 Raised if one or more data ID keys are missing. 

770 MissingCollectionError 

771 Raised if any of ``collections`` does not exist in the registry. 

772 """ 

773 if not isinstance(datasetType, DatasetType): 

774 datasetType = self.getDatasetType(datasetType) 

775 dataId = DataCoordinate.standardize(dataId, graph=datasetType.dimensions, 

776 universe=self.dimensions, **kwds) 

777 collections = CollectionSearch.fromExpression(collections) 

778 for collectionRecord in collections.iter(self._collections, datasetType=datasetType): 

779 if collectionRecord.type is CollectionType.TAGGED: 

780 collectionColumn = \ 

781 self._tables.dataset_collection.columns[self._collections.getCollectionForeignKeyName()] 

782 fromClause = self._tables.dataset.join(self._tables.dataset_collection) 

783 elif collectionRecord.type is CollectionType.RUN: 

784 collectionColumn = self._tables.dataset.columns[self._collections.getRunForeignKeyName()] 

785 fromClause = self._tables.dataset 

786 else: 

787 raise NotImplementedError(f"Unrecognized CollectionType: '{collectionRecord.type}'.") 

788 whereTerms = [ 

789 self._tables.dataset.columns.dataset_type_name == datasetType.name, 

790 collectionColumn == collectionRecord.key, 

791 ] 

792 whereTerms.extend(self._tables.dataset.columns[name] == dataId[name] for name in dataId.keys()) 

793 query = self._tables.dataset.select().select_from( 

794 fromClause 

795 ).where( 

796 sqlalchemy.sql.and_(*whereTerms) 

797 ) 

798 result = self._db.query(query).fetchone() 

799 if result is not None: 

800 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId) 

801 return None 

802 
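# Hypothetical sketch of `findDataset` above; the dataset type name, data ID
# values, and collection name are illustrative:
#
#     ref = registry.findDataset("calexp",
#                                instrument="HSC", visit=903334, detector=16,
#                                collections=["run/a"])
#     if ref is not None:
#         print(ref.id, ref.run)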

803 @transactional 

804 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

805 run: str, *, producer: Optional[Quantum] = None, recursive: bool = False 

806 ) -> List[DatasetRef]: 

807 """Insert one or more datasets into the `Registry` 

808 

809 This always adds new datasets; to associate existing datasets with 

810 a new collection, use ``associate``. 

811 

812 Parameters 

813 ---------- 

814 datasetType : `DatasetType` or `str` 

815 A `DatasetType` or the name of one. 

816 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

817 Dimension-based identifiers for the new datasets. 

818 run : `str` 

819 The name of the run that produced the datasets. 

820 producer : `Quantum` 

821 Unit of work that produced the datasets. May be `None` to store 

822 no provenance information, but if present the `Quantum` must 

823 already have been added to the Registry. 

824 recursive : `bool` 

825 If True, recursively add datasets and attach entries for component 

826 datasets as well. 

827 

828 Returns 

829 ------- 

830 refs : `list` of `DatasetRef` 

831 Resolved `DatasetRef` instances for all given data IDs (in the same 

832 order). 

833 

834 Raises 

835 ------ 

836 ConflictingDefinitionError 

837 If a dataset with the same dataset type and data ID as one of those 

838 given already exists in the given collection. 

839 MissingCollectionError 

840 Raised if ``run`` does not exist in the registry. 

841 """ 

842 if not isinstance(datasetType, DatasetType): 

843 datasetType = self.getDatasetType(datasetType) 

844 rows = [] 

845 refs = [] 

846 runRecord = self._collections.find(run) 

847 base = { 

848 "dataset_type_name": datasetType.name, 

849 self._collections.getRunForeignKeyName(): runRecord.key, 

850 "quantum_id": producer.id if producer is not None else None, 

851 } 

852 # Expand data IDs and build both a list of unresolved DatasetRefs 

853 # and a list of dictionary rows for the dataset table. 

854 for dataId in dataIds: 

855 ref = DatasetRef(datasetType, self.expandDataId(dataId, graph=datasetType.dimensions)) 

856 refs.append(ref) 

857 row = dict(base, dataset_ref_hash=ref.hash) 

858 for dimension, value in ref.dataId.full.items(): 

859 row[dimension.name] = value 

860 rows.append(row) 

861 # Actually insert into the dataset table. 

862 try: 

863 datasetIds = self._db.insert(self._tables.dataset, *rows, returnIds=True) 

864 except sqlalchemy.exc.IntegrityError as err: 

865 raise ConflictingDefinitionError( 

866 f"Constraint violation while inserting datasets into run {run}. " 

867 f"This usually means that one or more datasets with the same dataset type and data ID " 

868 f"already exist in the collection, but it may be a foreign key violation." 

869 ) from err 

870 # Resolve the DatasetRefs with the autoincrement IDs we generated. 

871 refs = [ref.resolved(id=datasetId, run=run) for datasetId, ref in zip(datasetIds, refs)] 

872 if recursive and datasetType.isComposite(): 

873 # Insert component rows by recursing, and gather a single big list 

874 # of rows to insert into the dataset_composition table. 

875 compositionRows = [] 

876 for componentName in datasetType.storageClass.components: 

877 componentDatasetType = datasetType.makeComponentDatasetType(componentName) 

878 componentRefs = self.insertDatasets(componentDatasetType, 

879 dataIds=(ref.dataId for ref in refs), 

880 run=run, 

881 producer=producer, 

882 recursive=True) 

883 for parentRef, componentRef in zip(refs, componentRefs): 

884 parentRef._components[componentName] = componentRef 

885 compositionRows.append({ 

886 "parent_dataset_id": parentRef.id, 

887 "component_dataset_id": componentRef.id, 

888 "component_name": componentName, 

889 }) 

890 if compositionRows: 

891 self._db.insert(self._tables.dataset_composition, *compositionRows) 

892 return refs 

893 
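# Hypothetical sketch of `insertDatasets`; the data ID values are illustrative
# and "run/a" must already have been created with `registerRun`:
#
#     (ref,) = registry.insertDatasets(
#         "calexp",
#         dataIds=[{"instrument": "HSC", "visit": 903334, "detector": 16}],
#         run="run/a",
#     )
#     print(ref.id)  # autoincrement ID assigned by the database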

894 def getDataset(self, id: int, datasetType: Optional[DatasetType] = None, 

895 dataId: Optional[DataCoordinate] = None) -> Optional[DatasetRef]: 

896 """Retrieve a Dataset entry. 

897 

898 Parameters 

899 ---------- 

900 id : `int` 

901 The unique identifier for the Dataset. 

902 datasetType : `DatasetType`, optional 

903 The `DatasetType` of the dataset to retrieve. This is used to 

904 short-circuit retrieving the `DatasetType`, so if provided, the 

905 caller is guaranteeing that it is what would have been retrieved. 

906 dataId : `DataCoordinate`, optional 

907 A `Dimension`-based identifier for the dataset within a 

908 collection, possibly containing additional metadata. This is used 

909 to short-circuit retrieving the dataId, so if provided, the 

910 caller is guaranteeing that it is what would have been retrieved. 

911 

912 Returns 

913 ------- 

914 ref : `DatasetRef` 

915 A ref to the Dataset, or `None` if no matching Dataset 

916 was found. 

917 """ 

918 result = self._db.query( 

919 self._tables.dataset.select().where( 

920 self._tables.dataset.columns.dataset_id == id 

921 ) 

922 ).fetchone() 

923 if result is None: 

924 return None 

925 return self._makeDatasetRefFromRow(result, datasetType=datasetType, dataId=dataId) 

926 

927 @transactional 

928 def removeDatasets(self, refs: Iterable[DatasetRef], *, recursive: bool = True): 

929 """Remove datasets from the Registry. 

930 

931 The datasets will be removed unconditionally from all collections, and 

932 any `Quantum` that consumed these datasets will instead be marked as

933 having a NULL input. `Datastore` records will *not* be deleted; the

934 caller is responsible for ensuring that the dataset has already been 

935 removed from all Datastores. 

936 

937 Parameters 

938 ---------- 

939 refs : `Iterable` of `DatasetRef` 

940 References to the datasets to be removed. Must include a valid 

941 ``id`` attribute, and should be considered invalidated upon return. 

942 recursive : `bool`, optional 

943 If `True`, remove all component datasets as well. Note that 

944 this only removes components that are actually included in the 

945 given `DatasetRef` instances, which may not be the same as those in 

946 the database (especially if they were obtained from 

947 `queryDatasets`, which does not populate `DatasetRef.components`). 

948 

949 Raises 

950 ------ 

951 AmbiguousDatasetError 

952 Raised if any ``ref.id`` is `None`. 

953 OrphanedRecordError 

954 Raised if any dataset is still present in any `Datastore`. 

955 """ 

956 if recursive: 

957 refs = DatasetRef.flatten(refs) 

958 rows = [{"dataset_id": _checkAndGetId(ref)} for ref in refs] 

959 # Remove the dataset records. We rely on ON DELETE clauses to 

960 # take care of other dependencies: 

961 # - ON DELETE CASCADE will remove dataset_composition rows. 

962 # - ON DELETE CASCADE will remove dataset_collection rows. 

963 # - ON DELETE SET NULL will apply to dataset_consumer rows, making it 

964 # clear that the provenance of any quanta that used this dataset as 

965 # an input is now incomplete. 

966 try: 

967 self._db.delete(self._tables.dataset, ["dataset_id"], *rows) 

968 except sqlalchemy.exc.IntegrityError as err: 

969 raise OrphanedRecordError("One or more datasets is still " 

970 "present in one or more Datastores.") from err 

971 

972 @transactional 

973 def attachComponent(self, name: str, parent: DatasetRef, component: DatasetRef): 

974 """Attach a component to a dataset. 

975 

976 Parameters 

977 ---------- 

978 name : `str` 

979 Name of the component. 

980 parent : `DatasetRef` 

981 A reference to the parent dataset. Will be updated to reference 

982 the component. 

983 component : `DatasetRef` 

984 A reference to the component dataset. 

985 

986 Raises 

987 ------ 

988 AmbiguousDatasetError 

989 Raised if ``parent.id`` or ``component.id`` is `None`. 

990 """ 

991 # TODO Insert check for component name and type against 

992 # parent.storageClass specified components 

993 if parent.id is None: 

994 raise AmbiguousDatasetError(f"Cannot attach component to dataset {parent} without ID.") 

995 if component.id is None: 

996 raise AmbiguousDatasetError(f"Cannot attach component {component} without ID.") 

997 values = dict(component_name=name, 

998 parent_dataset_id=parent.id, 

999 component_dataset_id=component.id) 

1000 self._db.insert(self._tables.dataset_composition, values) 

1001 parent._components[name] = component 

1002 

1003 @transactional 

1004 def associate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

1005 """Add existing Datasets to a collection, implicitly creating the 

1006 collection if it does not already exist. 

1007 

1008 If a DatasetRef with the same exact ``dataset_id`` is already in a 

1009 collection nothing is changed. If a `DatasetRef` with the same 

1010 `DatasetType` and dimension values but with a different ``dataset_id``

1011 exists in the collection, `ConflictingDefinitionError` is raised.

1012 

1013 Parameters 

1014 ---------- 

1015 collection : `str` 

1016 Indicates the collection the Datasets should be associated with. 

1017 refs : iterable of `DatasetRef` 

1018 An iterable of resolved `DatasetRef` instances that already exist 

1019 in this `Registry`. 

1020 recursive : `bool`, optional 

1021 If `True`, associate all component datasets as well. Note that 

1022 this only associates components that are actually included in the 

1023 given `DatasetRef` instances, which may not be the same as those in 

1024 the database (especially if they were obtained from 

1025 `queryDatasets`, which does not populate `DatasetRef.components`). 

1026 

1027 Raises 

1028 ------ 

1029 ConflictingDefinitionError 

1030 If a Dataset with the given `DatasetRef` already exists in the 

1031 given collection. 

1032 AmbiguousDatasetError 

1033 Raised if ``any(ref.id is None for ref in refs)``. 

1034 MissingCollectionError 

1035 Raised if ``collection`` does not exist in the registry. 

1036 TypeError 

1037 Raised if adding new datasets to the given ``collection`` is not

1038 allowed.

1039 """ 

1040 collectionRecord = self._collections.find(collection) 

1041 if collectionRecord.type is not CollectionType.TAGGED: 

1042 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

1043 if recursive: 

1044 refs = DatasetRef.flatten(refs) 

1045 rows = [{"dataset_id": _checkAndGetId(ref), 

1046 "dataset_ref_hash": ref.hash, 

1047 self._collections.getCollectionForeignKeyName(): collectionRecord.key} 

1048 for ref in refs] 

1049 try: 

1050 self._db.replace(self._tables.dataset_collection, *rows) 

1051 except sqlalchemy.exc.IntegrityError as err: 

1052 raise ConflictingDefinitionError( 

1053 f"Constraint violation while associating datasets with collection {collection}. " 

1054 f"This probably means that one or more datasets with the same dataset type and data ID " 

1055 f"already exist in the collection, but it may also indicate that the datasets do not exist." 

1056 ) from err 

1057 

1058 @transactional 

1059 def disassociate(self, collection: str, refs: Iterable[DatasetRef], *, recursive: bool = True): 

1060 """Remove existing Datasets from a collection. 

1061 

1062 ``collection`` and ``ref`` combinations that are not currently 

1063 associated are silently ignored. 

1064 

1065 Parameters 

1066 ---------- 

1067 collection : `str` 

1068 The collection the Datasets should no longer be associated with. 

1069 refs : iterable of `DatasetRef` 

1070 An iterable of resolved `DatasetRef` instances that already exist 

1071 in this `Registry`. 

1072 recursive : `bool`, optional 

1073 If `True`, disassociate all component datasets as well. Note that 

1074 this only disassociates components that are actually included in 

1075 the given `DatasetRef` instances, which may not be the same as 

1076 those in the database (especially if they were obtained from 

1077 `queryDatasets`, which does not populate `DatasetRef.components`). 

1078 

1079 Raises 

1080 ------ 

1081 AmbiguousDatasetError 

1082 Raised if ``any(ref.id is None for ref in refs)``. 

1083 MissingCollectionError 

1084 Raised if ``collection`` does not exist in the registry. 

1085 TypeError 

1086 Raised if removing datasets from the given ``collection`` is not

1087 allowed.

1088 """ 

1089 collectionFieldName = self._collections.getCollectionForeignKeyName() 

1090 collectionRecord = self._collections.find(collection) 

1091 if collectionRecord.type is not CollectionType.TAGGED: 

1092 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

1093 "expected TAGGED.") 

1094 if recursive: 

1095 refs = DatasetRef.flatten(refs) 

1096 rows = [{"dataset_id": _checkAndGetId(ref), collectionFieldName: collectionRecord.key} 

1097 for ref in refs] 

1098 self._db.delete(self._tables.dataset_collection, ["dataset_id", collectionFieldName], *rows) 

1099 
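# Hypothetical sketch of tagging and untagging existing datasets with the two
# methods above; "best/v1" is an illustrative TAGGED collection and `refs`
# holds resolved `DatasetRef` instances:
#
#     registry.registerCollection("best/v1", type=CollectionType.TAGGED)
#     registry.associate("best/v1", refs)
#     registry.disassociate("best/v1", refs)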

1100 @transactional 

1101 def insertDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

1102 """Record that a datastore holds the given datasets. 

1103 

1104 Typically used by `Datastore`. 

1105 

1106 Parameters 

1107 ---------- 

1108 datastoreName : `str` 

1109 Name of the datastore holding these datasets. 

1110 refs : `~collections.abc.Iterable` of `DatasetRef` 

1111 References to the datasets. 

1112 

1113 Raises 

1114 ------ 

1115 AmbiguousDatasetError 

1116 Raised if ``any(ref.id is None for ref in refs)``. 

1117 """ 

1118 self._db.insert( 

1119 self._tables.dataset_location, 

1120 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs] 

1121 ) 

1122 

1123 @transactional 

1124 def moveDatasetLocationToTrash(self, datastoreName: str, refs: Iterable[DatasetRef]): 

1125 """Move the dataset location information to trash. 

1126 

1127 Parameters 

1128 ---------- 

1129 datastoreName : `str` 

1130 Name of the datastore holding these datasets. 

1131 refs : `~collections.abc.Iterable` of `DatasetRef` 

1132 References to the datasets. 

1133 """ 

1134 # We only want to move rows that already exist in the main table 

1135 filtered = self.checkDatasetLocations(datastoreName, refs) 

1136 self.canDeleteDatasetLocations(datastoreName, filtered) 

1137 self.removeDatasetLocation(datastoreName, filtered) 

1138 

1139 @transactional 

1140 def canDeleteDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]): 

1141 """Record that a datastore can delete this dataset 

1142 

1143 Parameters 

1144 ---------- 

1145 datastoreName : `str` 

1146 Name of the datastore holding these datasets. 

1147 refs : `~collections.abc.Iterable` of `DatasetRef` 

1148 References to the datasets. 

1149 

1150 Raises 

1151 ------ 

1152 AmbiguousDatasetError 

1153 Raised if ``any(ref.id is None for ref in refs)``. 

1154 """ 

1155 self._db.insert( 

1156 self._tables.dataset_location_trash, 

1157 *[{"datastore_name": datastoreName, "dataset_id": _checkAndGetId(ref)} for ref in refs] 

1158 ) 

1159 

1160 def checkDatasetLocations(self, datastoreName: str, refs: Iterable[DatasetRef]) -> List[DatasetRef]: 

1161 """Check which refs are listed for this datastore. 

1162 

1163 Parameters 

1164 ---------- 

1165 datastoreName : `str` 

1166 Name of the datastore holding these datasets. 

1167 refs : `~collections.abc.Iterable` of `DatasetRef` 

1168 References to the datasets. 

1169 

1170 Returns 

1171 ------- 

1172 present : `list` of `DatasetRef` 

1173 The subset of ``refs`` that are listed for this datastore.

1174 """ 

1175 

1176 table = self._tables.dataset_location 

1177 result = self._db.query( 

1178 sqlalchemy.sql.select( 

1179 [table.columns.datastore_name, table.columns.dataset_id] 

1180 ).where( 

1181 sqlalchemy.sql.and_(table.columns.dataset_id.in_([ref.id for ref in refs]), 

1182 table.columns.datastore_name == datastoreName) 

1183 ) 

1184 ).fetchall() 

1185 

1186 matched_ids = {r["dataset_id"] for r in result} 

1187 return [ref for ref in refs if ref.id in matched_ids] 

1188 

1189 def getDatasetLocations(self, ref: DatasetRef) -> Set[str]: 

1190 """Retrieve datastore locations for a given dataset. 

1191 

1192 Typically used by `Datastore`. 

1193 

1194 Parameters 

1195 ---------- 

1196 ref : `DatasetRef` 

1197 A reference to the dataset for which to retrieve storage 

1198 information. 

1199 

1200 Returns 

1201 ------- 

1202 datastores : `set` of `str` 

1203 All the matching datastores holding this dataset. Empty set 

1204 if the dataset does not exist anywhere. 

1205 

1206 Raises 

1207 ------ 

1208 AmbiguousDatasetError 

1209 Raised if ``ref.id`` is `None`. 

1210 """ 

1211 table = self._tables.dataset_location 

1212 result = self._db.query( 

1213 sqlalchemy.sql.select( 

1214 [table.columns.datastore_name] 

1215 ).where( 

1216 table.columns.dataset_id == ref.id 

1217 ) 

1218 ).fetchall() 

1219 return {r["datastore_name"] for r in result} 

1220 

1221 @transactional 

1222 def getTrashedDatasets(self, datastoreName: str) -> Set[FakeDatasetRef]: 

1223 """Retrieve all the dataset ref IDs that are in the trash 

1224 associated with the specified datastore. 

1225 

1226 Parameters 

1227 ---------- 

1228 datastoreName : `str` 

1229 The relevant datastore name to use. 

1230 

1231 Returns 

1232 ------- 

1233 refs : `set` of `FakeDatasetRef`

1234 Refs for the datasets that can be safely removed from this datastore.

1235 Can be empty. 

1236 """ 

1237 table = self._tables.dataset_location_trash 

1238 result = self._db.query( 

1239 sqlalchemy.sql.select( 

1240 [table.columns.dataset_id] 

1241 ).where( 

1242 table.columns.datastore_name == datastoreName 

1243 ) 

1244 ).fetchall() 

1245 return {FakeDatasetRef(r["dataset_id"]) for r in result} 

1246 

1247 @transactional 

1248 def emptyDatasetLocationsTrash(self, datastoreName: str, refs: Iterable[FakeDatasetRef]) -> None: 

1249 """Remove datastore location associated with these datasets from trash. 

1250 

1251 Typically used by `Datastore` when a dataset is removed. 

1252 

1253 Parameters 

1254 ---------- 

1255 datastoreName : `str` 

1256 Name of this `Datastore`. 

1257 refs : iterable of `FakeDatasetRef` 

1258 The dataset IDs to be removed. 

1259 

1260 Raises 

1261 ------ 

1262 AmbiguousDatasetError 

1263 Raised if ``ref.id`` is `None`. 

1264 """ 

1265 if not refs: 

1266 return 

1267 self._db.delete( 

1268 self._tables.dataset_location_trash, 

1269 ["dataset_id", "datastore_name"], 

1270 *[{"dataset_id": ref.id, "datastore_name": datastoreName} for ref in refs] 

1271 ) 

1272 

1273 @transactional 

1274 def removeDatasetLocation(self, datastoreName: str, refs: Iterable[DatasetRef]) -> None: 

1275 """Remove datastore location associated with this dataset. 

1276 

1277 Typically used by `Datastore` when a dataset is removed. 

1278 

1279 Parameters 

1280 ---------- 

1281 datastoreName : `str` 

1282 Name of this `Datastore`. 

1283 refs : iterable of `DatasetRef` 

1284 References to the datasets for which location information is to be removed.

1285 

1286 Raises 

1287 ------ 

1288 AmbiguousDatasetError 

1289 Raised if ``ref.id`` is `None`. 

1290 """ 

1291 if not refs: 

1292 return 

1293 self._db.delete( 

1294 self._tables.dataset_location, 

1295 ["dataset_id", "datastore_name"], 

1296 *[{"dataset_id": _checkAndGetId(ref), "datastore_name": datastoreName} for ref in refs] 

1297 ) 

1298 
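# Hypothetical sketch of the location/trash bookkeeping a `Datastore` would
# perform with the methods above; the datastore name is illustrative and `refs`
# holds resolved `DatasetRef` instances:
#
#     registry.insertDatasetLocations("FileDatastore@root", refs)
#     registry.getDatasetLocations(refs[0])            # -> {"FileDatastore@root"}
#     registry.moveDatasetLocationToTrash("FileDatastore@root", refs)
#     trashed = registry.getTrashedDatasets("FileDatastore@root")
#     registry.emptyDatasetLocationsTrash("FileDatastore@root", trashed)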

1299 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1300 records: Optional[Mapping[DimensionElement, DimensionRecord]] = None, **kwds): 

1301 """Expand a dimension-based data ID to include additional information. 

1302 

1303 Parameters 

1304 ---------- 

1305 dataId : `DataCoordinate` or `dict`, optional 

1306 Data ID to be expanded; augmented and overridden by ``kwds``. 

1307 graph : `DimensionGraph`, optional 

1308 Set of dimensions for the expanded ID. If `None`, the dimensions 

1309 will be inferred from the keys of ``dataId`` and ``kwds``. 

1310 Dimensions that are in ``dataId`` or ``kwds`` but not in ``graph`` 

1311 are silently ignored, providing a way to extract and expand a 

1312 subset of a data ID. 

1313 records : mapping [`DimensionElement`, `DimensionRecord`], optional 

1314 Dimension record data to use before querying the database for that 

1315 data. 

1316 **kwds 

1317 Additional keywords are treated like additional key-value pairs for 

1318 ``dataId``, extending and overriding it.

1319 

1320 Returns 

1321 ------- 

1322 expanded : `ExpandedDataCoordinate` 

1323 A data ID that includes full metadata for all of the dimensions it 

1324 identifies.

1325 """ 

1326 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwds) 

1327 if isinstance(standardized, ExpandedDataCoordinate): 

1328 return standardized 

1329 elif isinstance(dataId, ExpandedDataCoordinate): 

1330 records = dict(records) if records is not None else {} 

1331 records.update(dataId.records) 

1332 else: 

1333 records = dict(records) if records is not None else {} 

1334 keys = dict(standardized) 

1335 regions = [] 

1336 timespans = [] 

1337 for element in standardized.graph.primaryKeyTraversalOrder: 

1338 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1339 if record is ...: 

1340 storage = self._dimensions[element] 

1341 record = storage.fetch(keys) 

1342 records[element] = record 

1343 if record is not None: 

1344 for d in element.implied: 

1345 value = getattr(record, d.name) 

1346 if keys.setdefault(d, value) != value: 

1347 raise InconsistentDataIdError(f"Data ID {standardized} has {d.name}={keys[d]!r}, " 

1348 f"but {element.name} implies {d.name}={value!r}.") 

1349 if element in standardized.graph.spatial and record.region is not None: 

1350 if any(record.region.relate(r) & lsst.sphgeom.DISJOINT for r in regions): 

1351 raise InconsistentDataIdError(f"Data ID {standardized}'s region for {element.name} " 

1352 f"is disjoint with those for other elements.") 

1353 regions.append(record.region) 

1354 if element in standardized.graph.temporal: 

1355 if any(not record.timespan.overlaps(t) for t in timespans): 

1356 raise InconsistentDataIdError(f"Data ID {standardized}'s timespan for {element.name}" 

1357 f" is disjoint with those for other elements.") 

1358 timespans.append(record.timespan) 

1359 else: 

1360 if element in standardized.graph.required: 

1361 raise LookupError( 

1362 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1363 ) 

1364 if element.alwaysJoin: 

1365 raise InconsistentDataIdError( 

1366 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1367 f"but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1368 f"related." 

1369 ) 

1370 records.update((d, None) for d in element.implied) 

1371 return ExpandedDataCoordinate(standardized.graph, standardized.values(), records=records) 

1372 
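# Hypothetical sketch of `expandDataId` above; the keys and values are
# illustrative and must correspond to existing dimension records:
#
#     expanded = registry.expandDataId(instrument="HSC", visit=903334)
#     # `expanded` is an ExpandedDataCoordinate whose records carry, e.g., the
#     # visit's region and timespan as well as any implied dimension values.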

1373 def relateDataIds(self, a: DataId, b: DataId) -> Optional[ConsistentDataIds]: 

1374 """Compare the keys and values of a pair of data IDs for consistency. 

1375 

1376 See `ConsistentDataIds` for more information. 

1377 

1378 Parameters 

1379 ---------- 

1380 a : `dict` or `DataCoordinate` 

1381 First data ID to be compared. 

1382 b : `dict` or `DataCoordinate` 

1383 Second data ID to be compared. 

1384 

1385 Returns 

1386 ------- 

1387 relationship : `ConsistentDataIds` or `None` 

1388 Relationship information. This is not `None` and coerces to 

1389 `True` in boolean contexts if and only if the data IDs are 

1390 consistent in terms of all common key-value pairs, all many-to-many 

1391 join tables, and all spatial and temporal relationships.

1392 """ 

1393 a = DataCoordinate.standardize(a, universe=self.dimensions) 

1394 b = DataCoordinate.standardize(b, universe=self.dimensions) 

1395 aFull = getattr(a, "full", None) 

1396 bFull = getattr(b, "full", None) 

1397 aBest = aFull if aFull is not None else a 

1398 bBest = bFull if bFull is not None else b 

1399 jointKeys = aBest.keys() & bBest.keys() 

1400 # If any common values are not equal, we know they are inconsistent. 

1401 if any(aBest[k] != bBest[k] for k in jointKeys): 

1402 return None 

1403 # If the graphs are equal, we know the data IDs are. 

1404 if a.graph == b.graph: 

1405 return ConsistentDataIds(contains=True, within=True, overlaps=bool(jointKeys)) 

1406 # Result is still inconclusive. Try to expand a data ID containing 

1407 # keys from both; that will fail if they are inconsistent. 

1408 # First, if either input was already an ExpandedDataCoordinate, extract 

1409 # its records so we don't have to query for them. 

1410 records = {} 

1411 if hasattr(a, "records"): 

1412 records.update(a.records) 

1413 if hasattr(b, "records"): 

1414 records.update(b.records) 

1415 try: 

1416 self.expandDataId({**a, **b}, graph=(a.graph | b.graph), records=records) 

1417 except InconsistentDataIdError: 

1418 return None 

1419 # We know the answer is not `None`; time to figure out what it is. 

1420 return ConsistentDataIds( 

1421 contains=(a.graph >= b.graph), 

1422 within=(a.graph <= b.graph), 

1423 overlaps=bool(a.graph & b.graph), 

1424 ) 

1425 
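# Hypothetical sketch of interpreting a `relateDataIds` result; data ID values
# are illustrative:
#
#     rel = registry.relateDataIds({"instrument": "HSC", "visit": 903334},
#                                  {"instrument": "HSC"})
#     if rel:              # not None, so the data IDs are "not inconsistent"
#         rel.contains     # True: the first data ID has every key of the second
#         rel.within       # False: the second data ID lacks "visit"
#         rel.overlaps     # True: "instrument" is a shared key with equal values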

1426 def insertDimensionData(self, element: Union[DimensionElement, str], 

1427 *data: Union[dict, DimensionRecord], 

1428 conform: bool = True): 

1429 """Insert one or more dimension records into the database. 

1430 

1431 Parameters 

1432 ---------- 

1433 element : `DimensionElement` or `str` 

1434 The `DimensionElement` or name thereof that identifies the table 

1435 records will be inserted into. 

1436 data : `dict` or `DimensionRecord` (variadic) 

1437 One or more records to insert. 

1438 conform : `bool`, optional 

1439 If `False` (`True` is default) perform no checking or conversions, 

1440 and assume that ``element`` is a `DimensionElement` instance and 

1441 ``data`` is one or more `DimensionRecord` instances of the

1442 appropriate subclass. 

1443 """ 

1444 if conform: 

1445 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1446 records = [element.RecordClass.fromDict(row) if not type(row) is element.RecordClass else row 

1447 for row in data] 

1448 else: 

1449 records = data 

1450 storage = self._dimensions[element] 

1451 storage.insert(*records) 

1452 

1453 def syncDimensionData(self, element: Union[DimensionElement, str], 

1454 row: Union[dict, DimensionRecord], 

1455 conform: bool = True) -> bool: 

1456 """Synchronize the given dimension record with the database, inserting 

1457 if it does not already exist and comparing values if it does. 

1458 

1459 Parameters 

1460 ---------- 

1461 element : `DimensionElement` or `str` 

1462 The `DimensionElement` or name thereof that identifies the table 

1463 records will be inserted into. 

1464 row : `dict` or `DimensionRecord` 

1465 The record to insert. 

1466 conform : `bool`, optional 

1467 If `False` (`True` is default) perform no checking or conversions, 

1468 and assume that ``element`` is a `DimensionElement` instance and 

1469 ``row`` is a `DimensionRecord` instance of the

1470 appropriate subclass.

1471 

1472 Returns 

1473 ------- 

1474 inserted : `bool` 

1475 `True` if a new row was inserted, `False` otherwise. 

1476 

1477 Raises 

1478 ------ 

1479 ConflictingDefinitionError 

1480 Raised if the record exists in the database (according to primary 

1481 key lookup) but is inconsistent with the given one. 

1482 

1483 Notes 

1484 ----- 

1485 This method cannot be called within transactions, as it needs to be 

1486 able to perform its own transaction to be concurrent. 

1487 """ 

1488 if conform: 

1489 element = self.dimensions[element] # if this is a name, convert it to a true DimensionElement. 

1490 record = element.RecordClass.fromDict(row) if type(row) is not element.RecordClass else row

1491 else: 

1492 record = row 

1493 storage = self._dimensions[element] 

1494 try: 

1495 return storage.sync(record) 

1496 except DatabaseConflictError as err: 

1497 raise ConflictingDefinitionError(str(err)) from err 

1498 
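# Usage sketch (added for illustration; not part of the original source).
# Record fields other than "name" are illustrative guesses. Per the Notes
# above, this call must not be wrapped in an outer transaction.
#
#   inserted = registry.syncDimensionData(
#       "instrument",
#       {"name": "DummyCam", "visit_max": 8, "exposure_max": 8, "detector_max": 2},
#   )
#   # `inserted` is True only if a new row was written; an existing row with
#   # different values raises ConflictingDefinitionError instead.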

1499 def queryDatasetTypes(self, expression: Any = ...) -> Iterator[DatasetType]: 

1500 """Iterate over the dataset types whose names match an expression. 

1501 

1502 Parameters 

1503 ---------- 

1504 expression : `Any`, optional 

1505 An expression that fully or partially identifies the dataset types 

1506 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1507 `...` can be used to return all dataset types, and is the default. 

1508 See :ref:`daf_butler_dataset_type_expressions` for more 

1509 information. 

1510 

1511 Yields 

1512 ------ 

1513 datasetType : `DatasetType` 

1514 A `DatasetType` instance whose name matches ``expression``. 

1515 """ 

1516 yield from self._datasetStorage.fetchDatasetTypes(expression) 

1517 
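# Usage sketch (added for illustration; not part of the original source).
# The dataset type names and pattern are hypothetical.
#
#   import re
#   everything = list(registry.queryDatasetTypes())   # `...` default: all types
#   coadds = list(registry.queryDatasetTypes(re.compile(r"deepCoadd.*")))
#   some = list(registry.queryDatasetTypes(["raw", "calexp"]))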

1518 def queryCollections(self, expression: Any = ..., 

1519 datasetType: Optional[DatasetType] = None, 

1520 collectionType: Optional[CollectionType] = None, 

1521 flattenChains: bool = False, 

1522 includeChains: Optional[bool] = None) -> Iterator[str]: 

1523 """Iterate over the collections whose names match an expression. 

1524 

1525 Parameters 

1526 ---------- 

1527 expression : `Any`, optional 

1528 An expression that fully or partially identifies the collections 

1529 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1530 `...` can be used to return all collections, and is the default. 

1531 See :ref:`daf_butler_collection_expressions` for more 

1532 information. 

1533 datasetType : `DatasetType`, optional 

1534 If provided, only yield collections that should be searched for 

1535 this dataset type according to ``expression``. If this is 

1536 not provided, any dataset type restrictions in ``expression`` are 

1537 ignored. 

1538 collectionType : `CollectionType`, optional 

1539 If provided, only yield collections of this type. 

1540 flattenChains : `bool`, optional 

1541 If `True` (`False` is default), recursively yield the child 

1542 collections of matching `~CollectionType.CHAINED` collections. 

1543 includeChains : `bool`, optional 

1544 If `True`, yield records for matching `~CollectionType.CHAINED` 

1545 collections. Default is the opposite of ``flattenChains``: include 

1546 either CHAINED collections or their children, but not both. 

1547 

1548 Yields 

1549 ------ 

1550 collection : `str` 

1551 The name of a collection that matches ``expression``. 

1552 """ 

1553 query = CollectionQuery.fromExpression(expression) 

1554 for record in query.iter(self._collections, datasetType=datasetType, collectionType=collectionType, 

1555 flattenChains=flattenChains, includeChains=includeChains): 

1556 yield record.name 

1557 
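# Usage sketch (added for illustration; not part of the original source).
# The import path for CollectionType and the collection name are assumptions.
#
#   from lsst.daf.butler.registry import CollectionType
#
#   runs = list(registry.queryCollections(collectionType=CollectionType.RUN))
#   # Expand a CHAINED collection named "defaults" into its child collections.
#   children = list(registry.queryCollections("defaults", flattenChains=True))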

1558 def makeQueryBuilder(self, summary: QuerySummary) -> QueryBuilder: 

1559 """Return a `QueryBuilder` instance capable of constructing and 

1560 managing more complex queries than those obtainable via `Registry` 

1561 interfaces. 

1562 

1563 This is an advanced interface; downstream code should prefer 

1564 `Registry.queryDimensions` and `Registry.queryDatasets` whenever those 

1565 are sufficient. 

1566 

1567 Parameters 

1568 ---------- 

1569 summary : `QuerySummary` 

1570 Object describing and categorizing the full set of dimensions that 

1571 will be included in the query. 

1572 

1573 Returns 

1574 ------- 

1575 builder : `QueryBuilder` 

1576 Object that can be used to construct and perform advanced queries. 

1577 """ 

1578 return QueryBuilder(connection=self._connection, summary=summary, 

1579 dimensionStorage=self._dimensions, 

1580 datasetStorage=self._datasetStorage) 

1581 
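# Usage sketch (added for illustration; not part of the original source). It
# mirrors the QuerySummary/QueryBuilder pattern used by queryDimensions and
# queryDatasets below; the dimension names, the "raw" dataset type, the
# keyword form of expandDataId, and the use of `...` for "all collections"
# are illustrative assumptions.
#
#   rawType = next(registry.queryDatasetTypes("raw"))
#   collections = CollectionQuery.fromExpression(...)
#   summary = QuerySummary(
#       requested=registry.dimensions.extract(["exposure", "detector"]),
#       dataId=registry.expandDataId(instrument="DummyCam"),
#       expression="exposure > 100",
#   )
#   builder = registry.makeQueryBuilder(summary)
#   builder.joinDataset(rawType, collections, isResult=True)
#   query = builder.finish()
#   predicate = query.predicate()
#   dataIds = [query.extractDataId(row) for row in query.execute() if predicate(row)]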

1582 def queryDimensions(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1583 dataId: Optional[DataId] = None, 

1584 datasets: Any = None, 

1585 collections: Any = None, 

1586 where: Optional[str] = None, 

1587 expand: bool = True, 

1588 **kwds) -> Iterator[DataCoordinate]: 

1589 """Query for and iterate over data IDs matching user-provided criteria. 

1590 

1591 Parameters 

1592 ---------- 

1593 dimensions : `Dimension` or `str`, or iterable thereof 

1594 The dimensions of the data IDs to yield, as either `Dimension` 

1595 instances or `str`. Will be automatically expanded to a complete 

1596 `DimensionGraph`. 

1597 dataId : `dict` or `DataCoordinate`, optional 

1598 A data ID whose key-value pairs are used as equality constraints 

1599 in the query. 

1600 datasets : `Any`, optional 

1601 An expression that fully or partially identifies dataset types 

1602 that should constrain the yielded data IDs. For example, including 

1603 "raw" here would constrain the yielded ``instrument``, 

1604 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1605 those for which at least one "raw" dataset exists in 

1606 ``collections``. Allowed types include `DatasetType`, `str`, 

1607 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1608 expressions, `...` is not permitted - it doesn't make sense to 

1609 constrain data IDs on the existence of *all* datasets. 

1610 See :ref:`daf_butler_dataset_type_expressions` for more 

1611 information. 

1612 collections : `Any`, optional

1613 An expression that fully or partially identifies the collections 

1614 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1615 thereof. `...` can be used to return all collections. Must be 

1616 provided if ``datasets`` is, and is ignored if it is not. See 

1617 :ref:`daf_butler_collection_expressions` for more information. 

1618 where : `str`, optional 

1619 A string expression similar to a SQL WHERE clause. May involve 

1620 any column of a dimension table or (as a shortcut for the primary 

1621 key column of a dimension table) dimension name. See 

1622 :ref:`daf_butler_dimension_expressions` for more information. 

1623 expand : `bool`, optional 

1624 If `True` (default) yield `ExpandedDataCoordinate` instead of 

1625 minimal `DataCoordinate` base-class instances. 

1626 kwds 

1627 Additional keyword arguments are forwarded to 

1628 `DataCoordinate.standardize` when processing the ``dataId`` 

1629 argument (and may be used to provide a constraining data ID even 

1630 when the ``dataId`` argument is `None`). 

1631 

1632 Yields 

1633 ------ 

1634 dataId : `DataCoordinate` 

1635 Data IDs matching the given query parameters. Order is 

1636 unspecified. 

1637 """ 

1638 dimensions = iterable(dimensions) 

1639 standardizedDataId = self.expandDataId(dataId, **kwds) 

1640 standardizedDatasetTypes = [] 

1641 requestedDimensionNames = set(self.dimensions.extract(dimensions).names) 

1642 if datasets is not None: 

1643 if collections is None: 

1644 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1645 for datasetType in self._datasetStorage.fetchDatasetTypes(datasets): 

1646 requestedDimensionNames.update(datasetType.dimensions.names) 

1647 standardizedDatasetTypes.append(datasetType) 

1648 # Preprocess collections expression in case the original included 

1649 # single-pass iterators (we'll want to use it multiple times 

1650 # below). 

1651 collections = CollectionQuery.fromExpression(collections) 

1652 

1653 summary = QuerySummary( 

1654 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1655 dataId=standardizedDataId, 

1656 expression=where, 

1657 ) 

1658 builder = self.makeQueryBuilder(summary) 

1659 for datasetType in standardizedDatasetTypes: 

1660 builder.joinDataset(datasetType, collections, isResult=False) 

1661 query = builder.finish() 

1662 predicate = query.predicate() 

1663 for row in query.execute(): 

1664 if predicate(row): 

1665 result = query.extractDataId(row) 

1666 if expand: 

1667 yield self.expandDataId(result, records=standardizedDataId.records) 

1668 else: 

1669 yield result 

1670 
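# Usage sketch (added for illustration; not part of the original source).
# The "raw" dataset type follows the example in the docstring above; the
# collection name is a placeholder, and ``where`` uses the dimension-name
# shortcut for a primary key column.
#
#   for dataId in registry.queryDimensions(
#           ["exposure", "detector"],
#           datasets="raw",
#           collections="DummyCam/raw",
#           where="detector = 10",
#   ):
#       print(dataId)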

1671 def queryDatasets(self, datasetType: Any, *, 

1672 collections: Any, 

1673 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1674 dataId: Optional[DataId] = None, 

1675 where: Optional[str] = None, 

1676 deduplicate: bool = False, 

1677 expand: bool = True, 

1678 **kwds) -> Iterator[DatasetRef]: 

1679 """Query for and iterate over dataset references matching user-provided 

1680 criteria. 

1681 

1682 Parameters 

1683 ---------- 

1684 datasetType : `Any`

1685 An expression that fully or partially identifies the dataset types 

1686 to be queried. Allowed types include `DatasetType`, `str`, 

1687 `re.Pattern`, and iterables thereof. The special value `...` can 

1688 be used to query all dataset types. See 

1689 :ref:`daf_butler_dataset_type_expressions` for more information. 

1690 collections : `Any`

1691 An expression that fully or partially identifies the collections 

1692 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1693 thereof. `...` can be used to return all collections. See 

1694 :ref:`daf_butler_collection_expressions` for more information. 

1695 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1696 Dimensions to include in the query (in addition to those used 

1697 to identify the queried dataset type(s)), either to constrain 

1698 the resulting datasets to those for which a matching dimension 

1699 exists, or to relate the dataset type's dimensions to dimensions 

1700 referenced by the ``dataId`` or ``where`` arguments. 

1701 dataId : `dict` or `DataCoordinate`, optional 

1702 A data ID whose key-value pairs are used as equality constraints 

1703 in the query. 

1704 where : `str`, optional 

1705 A string expression similar to a SQL WHERE clause. May involve 

1706 any column of a dimension table or (as a shortcut for the primary 

1707 key column of a dimension table) dimension name. See 

1708 :ref:`daf_butler_dimension_expressions` for more information. 

1709 deduplicate : `bool`, optional 

1710 If `True` (`False` is default), for each result data ID, only 

1711 yield one `DatasetRef` of each `DatasetType`, from the first 

1712 collection in which a dataset of that dataset type appears 

1713 (according to the order of ``collections`` passed in). In this case,

1714 ``collections`` must not contain regular expressions and may not 

1715 be `...`. 

1716 expand : `bool`, optional 

1717 If `True` (default), attach `ExpandedDataCoordinate` instead of

1718 minimal `DataCoordinate` base-class instances to the yielded refs.

1719 kwds 

1720 Additional keyword arguments are forwarded to 

1721 `DataCoordinate.standardize` when processing the ``dataId`` 

1722 argument (and may be used to provide a constraining data ID even 

1723 when the ``dataId`` argument is `None`). 

1724 

1725 Yields 

1726 ------ 

1727 ref : `DatasetRef` 

1728 Dataset references matching the given query criteria. These 

1729 are grouped by `DatasetType` if the query evaluates to multiple 

1730 dataset types, but order is otherwise unspecified. 

1731 

1732 Raises 

1733 ------ 

1734 TypeError 

1735 Raised when the arguments are incompatible, such as when a 

1736 collection wildcard is passed when ``deduplicate`` is `True`. 

1737 

1738 Notes 

1739 ----- 

1740 When multiple dataset types are queried in a single call, the 

1741 results of this operation are equivalent to querying for each dataset 

1742 type separately in turn, and no information about the relationships 

1743 between datasets of different types is included. In contexts where 

1744 that kind of information is important, the recommended pattern is to 

1745 use `queryDimensions` to first obtain data IDs (possibly with the 

1746 desired dataset types and collections passed as constraints to the 

1747 query), and then use multiple (generally much simpler) calls to 

1748 `queryDatasets` with the returned data IDs passed as constraints. 

1749 """ 

1750 # Standardize the collections expression. 

1751 if deduplicate: 

1752 collections = CollectionSearch.fromExpression(collections) 

1753 else: 

1754 collections = CollectionQuery.fromExpression(collections) 

1755 # Standardize and expand the data ID provided as a constraint. 

1756 standardizedDataId = self.expandDataId(dataId, **kwds) 

1757 # If the datasetType passed isn't actually a DatasetType, expand it 

1758 # (it could be an expression that yields multiple DatasetTypes) and 

1759 # recurse. 

1760 if not isinstance(datasetType, DatasetType): 

1761 for trueDatasetType in self._datasetStorage.fetchDatasetTypes(datasetType): 

1762 yield from self.queryDatasets(trueDatasetType, collections=collections, 

1763 dimensions=dimensions, dataId=standardizedDataId, 

1764 where=where, deduplicate=deduplicate, expand=expand)

1765 return 

1766 # The full set of dimensions in the query is the combination of those 

1767 # needed for the DatasetType and those explicitly requested, if any. 

1768 requestedDimensionNames = set(datasetType.dimensions.names) 

1769 if dimensions is not None: 

1770 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1771 # Construct the summary structure needed to construct a QueryBuilder. 

1772 summary = QuerySummary( 

1773 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1774 dataId=standardizedDataId, 

1775 expression=where, 

1776 ) 

1777 builder = self.makeQueryBuilder(summary) 

1778 # Add the dataset subquery to the query, telling the QueryBuilder to 

1779 # include the rank of the selected collection in the results only if we 

1780 # need to deduplicate. Note that if any of the collections are 

1781 # actually wildcard expressions, and we've asked for deduplication, 

1782 # this will raise TypeError for us. 

1783 if not builder.joinDataset(datasetType, collections, isResult=True, addRank=deduplicate): 

1784 return 

1785 query = builder.finish() 

1786 predicate = query.predicate() 

1787 if not deduplicate: 

1788 # No need to de-duplicate across collections. 

1789 for row in query.execute(): 

1790 if predicate(row): 

1791 dataId = query.extractDataId(row, graph=datasetType.dimensions) 

1792 if expand: 

1793 dataId = self.expandDataId(dataId, records=standardizedDataId.records) 

1794 yield query.extractDatasetRef(row, datasetType, dataId)[0] 

1795 else: 

1796 # For each data ID, yield only the DatasetRef with the lowest 

1797 # collection rank. 

1798 bestRefs = {} 

1799 bestRanks = {} 

1800 for row in query.execute(): 

1801 if predicate(row): 

1802 ref, rank = query.extractDatasetRef(row, datasetType) 

1803 bestRank = bestRanks.get(ref.dataId, sys.maxsize) 

1804 if rank < bestRank: 

1805 bestRefs[ref.dataId] = ref 

1806 bestRanks[ref.dataId] = rank 

1807 # If caller requested expanded data IDs, we defer that until here 

1808 # so we do as little expansion as possible. 

1809 if expand: 

1810 for ref in bestRefs.values(): 

1811 dataId = self.expandDataId(ref.dataId, records=standardizedDataId.records) 

1812 yield ref.expanded(dataId) 

1813 else: 

1814 yield from bestRefs.values() 

1815 
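# Usage sketch (added for illustration; not part of the original source),
# following the pattern recommended in the Notes above: constrain the data
# IDs once with queryDimensions, then issue simple per-dataset-type calls to
# queryDatasets. Dataset type and collection names are placeholders, and the
# single-element unpacking assumes each dataset exists for every data ID.
#
#   collections = ["processing/run1", "processing/run2"]
#   for dataId in registry.queryDimensions(["visit", "detector"],
#                                          datasets=["calexp", "src"],
#                                          collections=collections):
#       calexpRef, = registry.queryDatasets("calexp", collections=collections,
#                                           dataId=dataId, deduplicate=True)
#       srcRef, = registry.queryDatasets("src", collections=collections,
#                                        dataId=dataId, deduplicate=True)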

1816 dimensions: DimensionUniverse 

1817 """The universe of all dimensions known to the registry 

1818 (`DimensionUniverse`). 

1819 """ 

1820 

1821 storageClasses: StorageClassFactory 

1822 """All storage classes known to the registry (`StorageClassFactory`). 

1823 """