
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from ..core import ( 

48 Config, 

49 DataCoordinate, 

50 DataCoordinateIterable, 

51 DataId, 

52 DatasetAssociation, 

53 DatasetRef, 

54 DatasetType, 

55 ddl, 

56 Dimension, 

57 DimensionConfig, 

58 DimensionElement, 

59 DimensionGraph, 

60 DimensionRecord, 

61 DimensionUniverse, 

62 NamedKeyMapping, 

63 NameLookupMapping, 

64 StorageClassFactory, 

65 Timespan, 

66) 

67from . import queries 

68from ..core.utils import doImport, iterable, transactional 

69from ._config import RegistryConfig 

70from ._collectionType import CollectionType 

71from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

72from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

73from .interfaces import ChainedCollectionRecord, RunRecord 

74from .versions import ButlerVersionsManager, DigestMismatchError 

75 

76if TYPE_CHECKING: 

77 from ..butlerConfig import ButlerConfig 

78 from .interfaces import ( 

79 ButlerAttributeManager, 

80 CollectionManager, 

81 Database, 

82 OpaqueTableStorageManager, 

83 DimensionRecordStorageManager, 

84 DatasetRecordStorageManager, 

85 DatastoreRegistryBridgeManager, 

86 ) 

87 

88 

89_LOG = logging.getLogger(__name__) 

90 

91# key for dimensions configuration in attributes table 

92_DIMENSIONS_ATTR = "config:dimensions.json" 

93 

94 

95class Registry: 

96 """Registry interface. 

97 

98 Parameters 

99 ---------- 

100 database : `Database` 

101 Database instance to store Registry. 

102 attributes : `type` 

103 Manager class implementing `ButlerAttributeManager`. 

104 opaque : `type` 

105 Manager class implementing `OpaqueTableStorageManager`. 

106 dimensions : `type` 

107 Manager class implementing `DimensionRecordStorageManager`. 

108 collections : `type` 

109 Manager class implementing `CollectionManager`. 

110 datasets : `type` 

111 Manager class implementing `DatasetRecordStorageManager`. 

112 datastoreBridges : `type` 

113 Manager class implementing `DatastoreRegistryBridgeManager`. 

114 dimensionConfig : `DimensionConfig`, optional 

115 Dimension universe configuration, only used when ``create`` is True. 

116 writeable : `bool`, optional 

117 If True, the Registry will support write operations. 

118 create : `bool`, optional 

119 If True, the database schema will be initialized; the database must 

120 be empty before instantiating the Registry. 

121 """ 

122 

123 defaultConfigFile: Optional[str] = None 

124 """Path to configuration defaults. Accessed within the ``configs`` resource 

125 or relative to a search path. Can be None if no defaults specified. 

126 """ 

127 

128 @classmethod 

129 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

130 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

131 butlerRoot: Optional[str] = None) -> Registry: 

132 """Create registry database and return `Registry` instance. 

133 

134 This method initializes database contents; the database must be empty 

135 prior to calling this method. 

136 

137 Parameters 

138 ---------- 

139 config : `RegistryConfig` or `str`, optional 

140 Registry configuration; if missing, the default configuration will 

141 be loaded from registry.yaml. 

142 dimensionConfig : `DimensionConfig` or `str`, optional 

143 Dimensions configuration; if missing, the default configuration 

144 will be loaded from dimensions.yaml. 

145 butlerRoot : `str`, optional 

146 Path to the repository root this `Registry` will manage. 

147 

148 Returns 

149 ------- 

150 registry : `Registry` 

151 A new `Registry` instance. 

152 """ 

153 if isinstance(config, str): 

154 config = RegistryConfig(config) 

155 elif config is None: 

156 config = RegistryConfig() 

157 elif not isinstance(config, RegistryConfig): 

158 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

159 config.replaceRoot(butlerRoot) 

160 

161 if isinstance(dimensionConfig, str): 

162 dimensionConfig = DimensionConfig(dimensionConfig) 

163 elif dimensionConfig is None: 

164 dimensionConfig = DimensionConfig() 

165 elif not isinstance(dimensionConfig, DimensionConfig): 

166 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

167 

168 DatabaseClass = config.getDatabaseClass() 

169 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

170 namespace=config.get("namespace")) 

171 attributes = doImport(config["managers", "attributes"]) 

172 opaque = doImport(config["managers", "opaque"]) 

173 dimensions = doImport(config["managers", "dimensions"]) 

174 collections = doImport(config["managers", "collections"]) 

175 datasets = doImport(config["managers", "datasets"]) 

176 datastoreBridges = doImport(config["managers", "datastores"]) 

177 

178 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

179 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

180 dimensionConfig=dimensionConfig, create=True) 

181 

182 @classmethod 

183 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

184 butlerRoot: Optional[str] = None, writeable: bool = True) -> Registry: 

185 """Create `Registry` subclass instance from `config`. 

186 

187 Registry database must be initialized prior to calling this method. 

188 

189 Parameters 

190 ---------- 

191 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

192 Registry configuration. 

193 butlerRoot : `str`, optional 

194 Path to the repository root this `Registry` will manage. 

195 writeable : `bool`, optional 

196 If `True` (default) create a read-write connection to the database. 

197 

198 Returns 

199 ------- 

200 registry : `Registry` (subclass) 

201 A new `Registry` subclass instance. 

202 """ 

203 if not isinstance(config, RegistryConfig): 

204 if isinstance(config, str) or isinstance(config, Config): 

205 config = RegistryConfig(config) 

206 else: 

207 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

208 config.replaceRoot(butlerRoot) 

209 DatabaseClass = config.getDatabaseClass() 

210 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

211 namespace=config.get("namespace"), writeable=writeable) 

212 attributes = doImport(config["managers", "attributes"]) 

213 opaque = doImport(config["managers", "opaque"]) 

214 dimensions = doImport(config["managers", "dimensions"]) 

215 collections = doImport(config["managers", "collections"]) 

216 datasets = doImport(config["managers", "datasets"]) 

217 datastoreBridges = doImport(config["managers", "datastores"]) 

218 

219 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

220 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

221 dimensionConfig=None, writeable=writeable) 

222 
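# --- Editorial example (not part of the original module) ---
# A minimal sketch of opening an existing repository's registry read-only via
# `fromConfig`; the import path and repository path are assumptions.
#
#     from lsst.daf.butler.registry import Registry
#
#     registry = Registry.fromConfig("/repo/example/butler.yaml", writeable=False)
#     print(registry.dimensions)   # the DimensionUniverse loaded from the database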

223 def __init__(self, database: Database, *, 

224 attributes: Type[ButlerAttributeManager], 

225 opaque: Type[OpaqueTableStorageManager], 

226 dimensions: Type[DimensionRecordStorageManager], 

227 collections: Type[CollectionManager], 

228 datasets: Type[DatasetRecordStorageManager], 

229 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

230 dimensionConfig: Optional[DimensionConfig] = None, 

231 writeable: bool = True, 

232 create: bool = False): 

233 self._db = database 

234 self.storageClasses = StorageClassFactory() 

235 

236 # With existing registry we have to read dimensions config from 

237 # database before we initialize all other managers. 

238 if dimensionConfig is None: 

239 assert not create, "missing DimensionConfig when create=True" 

240 with self._db.declareStaticTables(create=False) as context: 

241 self._attributes = attributes.initialize(self._db, context) 

242 

243 versions = ButlerVersionsManager( 

244 self._attributes, 

245 dict(attributes=self._attributes) 

246 ) 

247 # verify that configured versions are compatible with schema 

248 versions.checkManagersConfig() 

249 versions.checkManagersVersions(writeable) 

250 

251 # get the dimensions config, serialized as a string, from the database 

252 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR) 

253 if dimensionsString is not None: 

254 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json")) 

255 else: 

256 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database") 

257 

258 # make universe 

259 universe = DimensionUniverse(dimensionConfig) 

260 

261 with self._db.declareStaticTables(create=create) as context: 

262 self._attributes = attributes.initialize(self._db, context) 

263 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

264 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions) 

265 self._datasets = datasets.initialize(self._db, context, 

266 collections=self._collections, 

267 dimensions=self._dimensions) 

268 self._opaque = opaque.initialize(self._db, context) 

269 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

270 opaque=self._opaque, 

271 datasets=datasets, 

272 universe=self._dimensions.universe) 

273 versions = ButlerVersionsManager( 

274 self._attributes, 

275 dict( 

276 attributes=self._attributes, 

277 opaque=self._opaque, 

278 dimensions=self._dimensions, 

279 collections=self._collections, 

280 datasets=self._datasets, 

281 datastores=self._datastoreBridges, 

282 ) 

283 ) 

284 # store managers and their versions in attributes table 

285 context.addInitializer(lambda db: versions.storeManagersConfig()) 

286 context.addInitializer(lambda db: versions.storeManagersVersions()) 

287 # dump universe config as json into attributes (faster than YAML) 

288 json = dimensionConfig.dump(format="json") 

289 if json is not None: 

290 # Convert Optional[str] to str for mypy 

291 json_str = json 

292 context.addInitializer( 

293 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str) 

294 ) 

295 else: 

296 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON") 

297 

298 if not create: 

299 # verify that configured versions are compatible with schema 

300 versions.checkManagersConfig() 

301 versions.checkManagersVersions(writeable) 

302 try: 

303 versions.checkManagersDigests() 

304 except DigestMismatchError as exc: 

305 # A digest mismatch is potentially a serious error, but during 

306 # development it could be benign; treat it as a warning for 

307 # now. 

308 _LOG.warning(f"Registry schema digest mismatch: {exc}") 

309 

310 self._dimensions.refresh() 

311 self._collections.refresh() 

312 self._datasets.refresh() 

313 

314 def __str__(self) -> str: 

315 return str(self._db) 

316 

317 def __repr__(self) -> str: 

318 return f"Registry({self._db!r}, {self.dimensions!r})" 

319 

320 def isWriteable(self) -> bool: 

321 """Return `True` if this registry allows write operations, and `False` 

322 otherwise. 

323 """ 

324 return self._db.isWriteable() 

325 

326 @property 

327 def dimensions(self) -> DimensionUniverse: 

328 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

329 """ 

330 return self._dimensions.universe 

331 

332 @contextlib.contextmanager 

333 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

334 """Return a context manager that represents a transaction. 

335 """ 

336 try: 

337 with self._db.transaction(savepoint=savepoint): 

338 yield 

339 except BaseException: 

340 # TODO: this clears the caches sometimes when we wouldn't actually 

341 # need to. Can we avoid that? 

342 self._dimensions.clearCaches() 

343 raise 

344 

345 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

346 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

347 other data repository client. 

348 

349 Opaque table records can be added via `insertOpaqueData`, retrieved via 

350 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

351 

352 Parameters 

353 ---------- 

354 tableName : `str` 

355 Logical name of the opaque table. This may differ from the 

356 actual name used in the database by a prefix and/or suffix. 

357 spec : `ddl.TableSpec` 

358 Specification for the table to be added. 

359 """ 

360 self._opaque.register(tableName, spec) 

361 

362 @transactional 

363 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

364 """Insert records into an opaque table. 

365 

366 Parameters 

367 ---------- 

368 tableName : `str` 

369 Logical name of the opaque table. Must match the name used in a 

370 previous call to `registerOpaqueTable`. 

371 data 

372 Each additional positional argument is a dictionary that represents 

373 a single row to be added. 

374 """ 

375 self._opaque[tableName].insert(*data) 

376 

377 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

378 """Retrieve records from an opaque table. 

379 

380 Parameters 

381 ---------- 

382 tableName : `str` 

383 Logical name of the opaque table. Must match the name used in a 

384 previous call to `registerOpaqueTable`. 

385 where 

386 Additional keyword arguments are interpreted as equality 

387 constraints that restrict the returned rows (combined with AND); 

388 keyword arguments are column names and values are the values they 

389 must have. 

390 

391 Yields 

392 ------ 

393 row : `dict` 

394 A dictionary representing a single result row. 

395 """ 

396 yield from self._opaque[tableName].fetch(**where) 

397 

398 @transactional 

399 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

400 """Remove records from an opaque table. 

401 

402 Parameters 

403 ---------- 

404 tableName : `str` 

405 Logical name of the opaque table. Must match the name used in a 

406 previous call to `registerOpaqueTable`. 

407 where 

408 Additional keyword arguments are interpreted as equality 

409 constraints that restrict the deleted rows (combined with AND); 

410 keyword arguments are column names and values are the values they 

411 must have. 

412 """ 

413 self._opaque[tableName].delete(**where) 

414 

415 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None: 

416 """Add a new collection if one with the given name does not exist. 

417 

418 Parameters 

419 ---------- 

420 name : `str` 

421 The name of the collection to create. 

422 type : `CollectionType` 

423 Enum value indicating the type of collection to create. 

424 

425 Notes 

426 ----- 

427 This method cannot be called within transactions, as it needs to be 

428 able to perform its own transaction to be concurrent. 

429 """ 

430 self._collections.register(name, type) 

431 

432 def getCollectionType(self, name: str) -> CollectionType: 

433 """Return an enumeration value indicating the type of the given 

434 collection. 

435 

436 Parameters 

437 ---------- 

438 name : `str` 

439 The name of the collection. 

440 

441 Returns 

442 ------- 

443 type : `CollectionType` 

444 Enum value indicating the type of this collection. 

445 

446 Raises 

447 ------ 

448 MissingCollectionError 

449 Raised if no collection with the given name exists. 

450 """ 

451 return self._collections.find(name).type 

452 

453 def registerRun(self, name: str) -> None: 

454 """Add a new run if one with the given name does not exist. 

455 

456 Parameters 

457 ---------- 

458 name : `str` 

459 The name of the run to create. 

460 

461 Notes 

462 ----- 

463 This method cannot be called within transactions, as it needs to be 

464 able to perform its own transaction to be concurrent. 

465 """ 

466 self._collections.register(name, CollectionType.RUN) 

467 

468 @transactional 

469 def removeCollection(self, name: str) -> None: 

470 """Completely remove the given collection. 

471 

472 Parameters 

473 ---------- 

474 name : `str` 

475 The name of the collection to remove. 

476 

477 Raises 

478 ------ 

479 MissingCollectionError 

480 Raised if no collection with the given name exists. 

481 

482 Notes 

483 ----- 

484 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

485 in it are also fully removed. This requires that those datasets be 

486 removed (or at least trashed) from any datastores that hold them first. 

487 

488 A collection may not be deleted as long as it is referenced by a 

489 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

490 be deleted or redefined first. 

491 """ 

492 self._collections.remove(name) 

493 

494 def getCollectionChain(self, parent: str) -> CollectionSearch: 

495 """Return the child collections in a `~CollectionType.CHAINED` 

496 collection. 

497 

498 Parameters 

499 ---------- 

500 parent : `str` 

501 Name of the chained collection. Must have already been added via 

502 a call to `Registry.registerCollection`. 

503 

504 Returns 

505 ------- 

506 children : `CollectionSearch` 

507 An object that defines the search path of the collection. 

508 See :ref:`daf_butler_collection_expressions` for more information. 

509 

510 Raises 

511 ------ 

512 MissingCollectionError 

513 Raised if ``parent`` does not exist in the `Registry`. 

514 TypeError 

515 Raised if ``parent`` does not correspond to a 

516 `~CollectionType.CHAINED` collection. 

517 """ 

518 record = self._collections.find(parent) 

519 if record.type is not CollectionType.CHAINED: 

520 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

521 assert isinstance(record, ChainedCollectionRecord) 

522 return record.children 

523 

524 @transactional 

525 def setCollectionChain(self, parent: str, children: Any) -> None: 

526 """Define or redefine a `~CollectionType.CHAINED` collection. 

527 

528 Parameters 

529 ---------- 

530 parent : `str` 

531 Name of the chained collection. Must have already been added via 

532 a call to `Registry.registerCollection`. 

533 children : `Any` 

534 An expression defining an ordered search of child collections, 

535 generally an iterable of `str`; see 

536 :ref:`daf_butler_collection_expressions` for more information. 

537 

538 Raises 

539 ------ 

540 MissingCollectionError 

541 Raised when any of the given collections does not exist in the 

542 `Registry`. 

543 TypeError 

544 Raised if ``parent`` does not correspond to a 

545 `~CollectionType.CHAINED` collection. 

546 ValueError 

547 Raised if the given collections contain a cycle. 

548 """ 

549 record = self._collections.find(parent) 

550 if record.type is not CollectionType.CHAINED: 

551 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

552 assert isinstance(record, ChainedCollectionRecord) 

553 children = CollectionSearch.fromExpression(children) 

554 if children != record.children: 

555 record.update(self._collections, children) 

556 
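# --- Editorial example (not part of the original module) ---
# Sketch of chaining collections behind a single search name using
# `registerRun`, `registerCollection`, and `setCollectionChain`; the collection
# names are hypothetical.
#
#     registry.registerRun("HSC/runs/v1")
#     registry.registerCollection("HSC/calib", CollectionType.CALIBRATION)
#     registry.registerCollection("HSC/defaults", CollectionType.CHAINED)
#     registry.setCollectionChain("HSC/defaults", ["HSC/runs/v1", "HSC/calib"])
#     print(registry.getCollectionChain("HSC/defaults"))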

557 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

558 """ 

559 Add a new `DatasetType` to the Registry. 

560 

561 It is not an error to register the same `DatasetType` twice. 

562 

563 Parameters 

564 ---------- 

565 datasetType : `DatasetType` 

566 The `DatasetType` to be added. 

567 

568 Returns 

569 ------- 

570 inserted : `bool` 

571 `True` if ``datasetType`` was inserted, `False` if an identical 

572 existing `DatasetType` was found. Note that in either case the 

573 DatasetType is guaranteed to be defined in the Registry 

574 consistently with the given definition. 

575 

576 Raises 

577 ------ 

578 ValueError 

579 Raised if the dimensions or storage class are invalid. 

580 ConflictingDefinitionError 

581 Raised if this DatasetType is already registered with a different 

582 definition. 

583 

584 Notes 

585 ----- 

586 This method cannot be called within transactions, as it needs to be 

587 able to perform its own transaction to be concurrent. 

588 """ 

589 _, inserted = self._datasets.register(datasetType) 

590 return inserted 

591 

592 def removeDatasetType(self, name: str) -> None: 

593 """Remove the named `DatasetType` from the registry. 

594 

595 .. warning:: 

596 

597 Registry caches the dataset type definitions. This means that 

598 deleting the dataset type definition may result in unexpected 

599 behavior from other active butler processes that have 

600 not seen the deletion. 

601 

602 Parameters 

603 ---------- 

604 name : `str` 

605 Name of the type to be removed. 

606 

607 Raises 

608 ------ 

609 lsst.daf.butler.registry.OrphanedRecordError 

610 Raised if an attempt is made to remove the dataset type definition 

611 when there are already datasets associated with it. 

612 

613 Notes 

614 ----- 

615 If the dataset type is not registered the method will return without 

616 action. 

617 """ 

618 self._datasets.remove(name) 

619 

620 def getDatasetType(self, name: str) -> DatasetType: 

621 """Get the `DatasetType`. 

622 

623 Parameters 

624 ---------- 

625 name : `str` 

626 Name of the type. 

627 

628 Returns 

629 ------- 

630 type : `DatasetType` 

631 The `DatasetType` associated with the given name. 

632 

633 Raises 

634 ------ 

635 KeyError 

636 Raised if the requested DatasetType could not be found in the registry. 

637 """ 

638 return self._datasets[name].datasetType 

639 

640 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

641 collections: Any, timespan: Optional[Timespan] = None, 

642 **kwargs: Any) -> Optional[DatasetRef]: 

643 """Find a dataset given its `DatasetType` and data ID. 

644 

645 This can be used to obtain a `DatasetRef` that permits the dataset to 

646 be read from a `Datastore`. If the dataset is a component and can not 

647 be found using the provided dataset type, a dataset ref for the parent 

648 will be returned instead but with the correct dataset type. 

649 

650 Parameters 

651 ---------- 

652 datasetType : `DatasetType` or `str` 

653 A `DatasetType` or the name of one. 

654 dataId : `dict` or `DataCoordinate`, optional 

655 A `dict`-like object containing the `Dimension` links that identify 

656 the dataset within a collection. 

657 collections 

658 An expression that fully or partially identifies the collections to 

659 search for the dataset; see 

660 :ref:`daf_butler_collection_expressions` for more information. 

661 timespan : `Timespan`, optional 

662 A timespan that the validity range of the dataset must overlap. 

663 If not provided, any `~CollectionType.CALIBRATION` collections 

664 matched by the ``collections`` argument will not be searched. 

665 **kwargs 

666 Additional keyword arguments passed to 

667 `DataCoordinate.standardize` to convert ``dataId`` to a true 

668 `DataCoordinate` or augment an existing one. 

669 

670 Returns 

671 ------- 

672 ref : `DatasetRef` 

673 A reference to the dataset, or `None` if no matching Dataset 

674 was found. 

675 

676 Raises 

677 ------ 

678 LookupError 

679 Raised if one or more data ID keys are missing. 

680 KeyError 

681 Raised if the dataset type does not exist. 

682 MissingCollectionError 

683 Raised if any of ``collections`` does not exist in the registry. 

684 

685 Notes 

686 ----- 

687 This method simply returns `None` and does not raise an exception even 

688 when the set of collections searched is intrinsically incompatible with 

689 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

690 only `~CollectionType.CALIBRATION` collections are being searched. 

691 This may make it harder to debug some lookup failures, but the behavior 

692 is intentional; we consider it more important that failed searches are 

693 reported consistently, regardless of the reason, and that adding 

694 additional collections that do not contain a match to the search path 

695 never changes the behavior. 

696 """ 

697 if isinstance(datasetType, DatasetType): 

698 storage = self._datasets[datasetType.name] 

699 else: 

700 storage = self._datasets[datasetType] 

701 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

702 universe=self.dimensions, **kwargs) 

703 collections = CollectionSearch.fromExpression(collections) 

704 for collectionRecord in collections.iter(self._collections): 

705 if (collectionRecord.type is CollectionType.CALIBRATION 

706 and (not storage.datasetType.isCalibration() or timespan is None)): 

707 continue 

708 result = storage.find(collectionRecord, dataId, timespan=timespan) 

709 if result is not None: 

710 return result 

711 

712 return None 

713 
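# --- Editorial example (not part of the original module) ---
# Sketch of a single-dataset lookup with `findDataset`; the dataset type name,
# data ID values, and collection are hypothetical and must already exist.
#
#     ref = registry.findDataset("calexp", instrument="HSC", visit=903334,
#                                detector=16, collections="HSC/runs/v1")
#     if ref is not None:
#         print(ref.dataId, ref.run)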

714 @transactional 

715 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

716 run: str) -> List[DatasetRef]: 

717 """Insert one or more datasets into the `Registry` 

718 

719 This always adds new datasets; to associate existing datasets with 

720 a new collection, use ``associate``. 

721 

722 Parameters 

723 ---------- 

724 datasetType : `DatasetType` or `str` 

725 A `DatasetType` or the name of one. 

726 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

727 Dimension-based identifiers for the new datasets. 

728 run : `str` 

729 The name of the run that produced the datasets. 

730 

731 Returns 

732 ------- 

733 refs : `list` of `DatasetRef` 

734 Resolved `DatasetRef` instances for all given data IDs (in the same 

735 order). 

736 

737 Raises 

738 ------ 

739 ConflictingDefinitionError 

740 If a dataset with the same dataset type and data ID as one of those 

741 given already exists in ``run``. 

742 MissingCollectionError 

743 Raised if ``run`` does not exist in the registry. 

744 """ 

745 if isinstance(datasetType, DatasetType): 

746 storage = self._datasets.find(datasetType.name) 

747 if storage is None: 

748 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

749 else: 

750 storage = self._datasets.find(datasetType) 

751 if storage is None: 

752 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

753 runRecord = self._collections.find(run) 

754 if runRecord.type is not CollectionType.RUN: 

755 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

756 assert isinstance(runRecord, RunRecord) 

757 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

758 for dataId in dataIds] 

759 try: 

760 refs = list(storage.insert(runRecord, expandedDataIds)) 

761 except sqlalchemy.exc.IntegrityError as err: 

762 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

763 f"one or more datasets of type {storage.datasetType} into " 

764 f"collection '{run}'. " 

765 f"This probably means a dataset with the same data ID " 

766 f"and dataset type already exists, but it may also mean a " 

767 f"dimension row is missing.") from err 

768 return refs 

769 
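# --- Editorial example (not part of the original module) ---
# Sketch of registering new datasets with `insertDatasets`; it assumes the
# "calexp" dataset type, the relevant dimension records, and the "HSC/runs/v1"
# RUN collection were registered beforehand.
#
#     refs = registry.insertDatasets(
#         "calexp",
#         dataIds=[{"instrument": "HSC", "visit": 903334, "detector": 16}],
#         run="HSC/runs/v1",
#     )
#     print(refs[0].id)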

770 def getDataset(self, id: int) -> Optional[DatasetRef]: 

771 """Retrieve a Dataset entry. 

772 

773 Parameters 

774 ---------- 

775 id : `int` 

776 The unique identifier for the dataset. 

777 

778 Returns 

779 ------- 

780 ref : `DatasetRef` or `None` 

781 A ref to the Dataset, or `None` if no matching Dataset 

782 was found. 

783 """ 

784 ref = self._datasets.getDatasetRef(id) 

785 if ref is None: 

786 return None 

787 return ref 

788 

789 @transactional 

790 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

791 """Remove datasets from the Registry. 

792 

793 The datasets will be removed unconditionally from all collections, and 

794 any `Quantum` that consumed this dataset will instead be marked as 

795 having a NULL input. `Datastore` records will *not* be deleted; the 

796 caller is responsible for ensuring that the dataset has already been 

797 removed from all Datastores. 

798 

799 Parameters 

800 ---------- 

801 refs : `Iterable` of `DatasetRef` 

802 References to the datasets to be removed. Must include a valid 

803 ``id`` attribute, and should be considered invalidated upon return. 

804 

805 Raises 

806 ------ 

807 AmbiguousDatasetError 

808 Raised if any ``ref.id`` is `None`. 

809 OrphanedRecordError 

810 Raised if any dataset is still present in any `Datastore`. 

811 """ 

812 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

813 storage = self._datasets.find(datasetType.name) 

814 assert storage is not None 

815 try: 

816 storage.delete(refsForType) 

817 except sqlalchemy.exc.IntegrityError as err: 

818 raise OrphanedRecordError("One or more datasets are still " 

819 "present in one or more Datastores.") from err 

820 

821 @transactional 

822 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

823 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

824 

825 If a DatasetRef with the same exact integer ID is already in a 

826 collection, nothing is changed. If a `DatasetRef` with the same 

827 `DatasetType` and data ID but with different integer ID 

828 exists in the collection, `ConflictingDefinitionError` is raised. 

829 

830 Parameters 

831 ---------- 

832 collection : `str` 

833 Indicates the collection the datasets should be associated with. 

834 refs : `Iterable` [ `DatasetRef` ] 

835 An iterable of resolved `DatasetRef` instances that already exist 

836 in this `Registry`. 

837 

838 Raises 

839 ------ 

840 ConflictingDefinitionError 

841 If a Dataset with the given `DatasetRef` already exists in the 

842 given collection. 

843 AmbiguousDatasetError 

844 Raised if ``any(ref.id is None for ref in refs)``. 

845 MissingCollectionError 

846 Raised if ``collection`` does not exist in the registry. 

847 TypeError 

848 Raised if adding new datasets to the given ``collection`` is not 

849 allowed. 

850 """ 

851 collectionRecord = self._collections.find(collection) 

852 if collectionRecord.type is not CollectionType.TAGGED: 

853 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

854 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

855 storage = self._datasets.find(datasetType.name) 

856 assert storage is not None 

857 try: 

858 storage.associate(collectionRecord, refsForType) 

859 except sqlalchemy.exc.IntegrityError as err: 

860 raise ConflictingDefinitionError( 

861 f"Constraint violation while associating dataset of type {datasetType.name} with " 

862 f"collection {collection}. This probably means that one or more datasets with the same " 

863 f"dataset type and data ID already exist in the collection, but it may also indicate " 

864 f"that the datasets do not exist." 

865 ) from err 

866 
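# --- Editorial example (not part of the original module) ---
# Sketch of tagging previously-queried datasets into a TAGGED collection with
# `associate`; the collection name and the query feeding ``refs`` are
# hypothetical.
#
#     registry.registerCollection("HSC/tagged/good-seeing", CollectionType.TAGGED)
#     refs = registry.queryDatasets("calexp", collections="HSC/runs/v1")
#     registry.associate("HSC/tagged/good-seeing", refs)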

867 @transactional 

868 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

869 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

870 

871 ``collection`` and ``ref`` combinations that are not currently 

872 associated are silently ignored. 

873 

874 Parameters 

875 ---------- 

876 collection : `str` 

877 The collection the datasets should no longer be associated with. 

878 refs : `Iterable` [ `DatasetRef` ] 

879 An iterable of resolved `DatasetRef` instances that already exist 

880 in this `Registry`. 

881 

882 Raises 

883 ------ 

884 AmbiguousDatasetError 

885 Raised if any of the given dataset references is unresolved. 

886 MissingCollectionError 

887 Raised if ``collection`` does not exist in the registry. 

888 TypeError 

889 Raised if removing datasets from the given ``collection`` is not 

890 allowed. 

891 """ 

892 collectionRecord = self._collections.find(collection) 

893 if collectionRecord.type is not CollectionType.TAGGED: 

894 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

895 "expected TAGGED.") 

896 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

897 storage = self._datasets.find(datasetType.name) 

898 assert storage is not None 

899 storage.disassociate(collectionRecord, refsForType) 

900 

901 @transactional 

902 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

903 """Associate one or more datasets with a calibration collection and a 

904 validity range within it. 

905 

906 Parameters 

907 ---------- 

908 collection : `str` 

909 The name of an already-registered `~CollectionType.CALIBRATION` 

910 collection. 

911 refs : `Iterable` [ `DatasetRef` ] 

912 Datasets to be associated. 

913 timespan : `Timespan` 

914 The validity range for these datasets within the collection. 

915 

916 Raises 

917 ------ 

918 AmbiguousDatasetError 

919 Raised if any of the given `DatasetRef` instances is unresolved. 

920 ConflictingDefinitionError 

921 Raised if the collection already contains a different dataset with 

922 the same `DatasetType` and data ID and an overlapping validity 

923 range. 

924 TypeError 

925 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

926 collection or if one or more datasets are of a dataset type for 

927 which `DatasetType.isCalibration` returns `False`. 

928 """ 

929 collectionRecord = self._collections.find(collection) 

930 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

931 storage = self._datasets[datasetType.name] 

932 storage.certify(collectionRecord, refsForType, timespan) 

933 
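# --- Editorial example (not part of the original module) ---
# Sketch of certifying calibration datasets into a CALIBRATION collection with
# a validity range; the use of astropy times to build the `Timespan`, the
# collection name, and the ``bias_refs`` variable are assumptions.
#
#     import astropy.time
#
#     begin = astropy.time.Time("2020-01-01T00:00:00", scale="tai")
#     end = astropy.time.Time("2021-01-01T00:00:00", scale="tai")
#     registry.certify("HSC/calib", bias_refs, Timespan(begin, end))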

934 @transactional 

935 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

936 dataIds: Optional[Iterable[DataId]] = None) -> None: 

937 """Remove or adjust datasets to clear a validity range within a 

938 calibration collection. 

939 

940 Parameters 

941 ---------- 

942 collection : `str` 

943 The name of an already-registered `~CollectionType.CALIBRATION` 

944 collection. 

945 datasetType : `str` or `DatasetType` 

946 Name or `DatasetType` instance for the datasets to be decertified. 

947 timespan : `Timespan` 

948 The validity range to remove datasets from within the collection. 

949 Datasets that overlap this range but are not contained by it will 

950 have their validity ranges adjusted to not overlap it, which may 

951 split a single dataset validity range into two. 

952 dataIds : `Iterable` [ `DataId` ], optional 

953 Data IDs that should be decertified within the given validity range. 

954 If `None`, all data IDs for the given ``datasetType`` will be 

955 decertified. 

956 

957 Raises 

958 ------ 

959 TypeError 

960 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

961 collection or if ``datasetType.isCalibration() is False``. 

962 """ 

963 collectionRecord = self._collections.find(collection) 

964 if isinstance(datasetType, str): 

965 storage = self._datasets[datasetType] 

966 else: 

967 storage = self._datasets[datasetType.name] 

968 standardizedDataIds = None 

969 if dataIds is not None: 

970 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

971 for d in dataIds] 

972 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

973 

974 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

975 """Return an object that allows a new `Datastore` instance to 

976 communicate with this `Registry`. 

977 

978 Returns 

979 ------- 

980 manager : `DatastoreRegistryBridgeManager` 

981 Object that mediates communication between this `Registry` and its 

982 associated datastores. 

983 """ 

984 return self._datastoreBridges 

985 

986 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

987 """Retrieve datastore locations for a given dataset. 

988 

989 Parameters 

990 ---------- 

991 ref : `DatasetRef` 

992 A reference to the dataset for which to retrieve storage 

993 information. 

994 

995 Returns 

996 ------- 

997 datastores : `Iterable` [ `str` ] 

998 All the matching datastores holding this dataset. 

999 

1000 Raises 

1001 ------ 

1002 AmbiguousDatasetError 

1003 Raised if ``ref.id`` is `None`. 

1004 """ 

1005 return self._datastoreBridges.findDatastores(ref) 

1006 

1007 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1008 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1009 **kwargs: Any) -> DataCoordinate: 

1010 """Expand a dimension-based data ID to include additional information. 

1011 

1012 Parameters 

1013 ---------- 

1014 dataId : `DataCoordinate` or `dict`, optional 

1015 Data ID to be expanded; augmented and overridden by ``kwargs``. 

1016 graph : `DimensionGraph`, optional 

1017 Set of dimensions for the expanded ID. If `None`, the dimensions 

1018 will be inferred from the keys of ``dataId`` and ``kwargs``. 

1019 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph`` 

1020 are silently ignored, providing a way to extract and expand a 

1021 subset of a data ID. 

1022 records : `Mapping` [`str`, `DimensionRecord`], optional 

1023 Dimension record data to use before querying the database for that 

1024 data, keyed by element name. 

1025 **kwargs 

1026 Additional keywords are treated like additional key-value pairs for 

1027 ``dataId``, extending and overriding 

1028 

1029 Returns 

1030 ------- 

1031 expanded : `DataCoordinate` 

1032 A data ID that includes full metadata for all of the dimensions it 

1033 identifies, i.e. guarantees that ``expanded.hasRecords()`` and 

1034 ``expanded.hasFull()`` both return `True`. 

1035 """ 

1036 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

1037 if standardized.hasRecords(): 

1038 return standardized 

1039 if records is None: 

1040 records = {} 

1041 elif isinstance(records, NamedKeyMapping): 

1042 records = records.byName() 

1043 else: 

1044 records = dict(records) 

1045 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1046 records.update(dataId.records.byName()) 

1047 keys = standardized.byName() 

1048 for element in standardized.graph.primaryKeyTraversalOrder: 

1049 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1050 if record is ...: 

1051 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1052 if element in standardized.graph.required: 

1053 raise LookupError( 

1054 f"No value or null value for required dimension {element.name}." 

1055 ) 

1056 keys[element.name] = None 

1057 record = None 

1058 else: 

1059 storage = self._dimensions[element] 

1060 dataIdSet = DataCoordinateIterable.fromScalar( 

1061 DataCoordinate.standardize(keys, graph=element.graph) 

1062 ) 

1063 fetched = tuple(storage.fetch(dataIdSet)) 

1064 try: 

1065 (record,) = fetched 

1066 except ValueError: 

1067 record = None 

1068 records[element.name] = record 

1069 if record is not None: 

1070 for d in element.implied: 

1071 value = getattr(record, d.name) 

1072 if keys.setdefault(d.name, value) != value: 

1073 raise InconsistentDataIdError( 

1074 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1075 f"but {element.name} implies {d.name}={value!r}." 

1076 ) 

1077 else: 

1078 if element in standardized.graph.required: 

1079 raise LookupError( 

1080 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1081 ) 

1082 if element.alwaysJoin: 

1083 raise InconsistentDataIdError( 

1084 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1085 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1086 "related." 

1087 ) 

1088 for d in element.implied: 

1089 keys.setdefault(d.name, None) 

1090 records.setdefault(d.name, None) 

1091 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1092 
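# --- Editorial example (not part of the original module) ---
# Sketch of expanding a minimal data ID into one carrying full dimension
# records; the dimension values are hypothetical and must exist in the
# registry.
#
#     dataId = registry.expandDataId(instrument="HSC", detector=16)
#     print(dataId.hasFull(), dataId.hasRecords())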

1093 def insertDimensionData(self, element: Union[DimensionElement, str], 

1094 *data: Union[Mapping[str, Any], DimensionRecord], 

1095 conform: bool = True) -> None: 

1096 """Insert one or more dimension records into the database. 

1097 

1098 Parameters 

1099 ---------- 

1100 element : `DimensionElement` or `str` 

1101 The `DimensionElement` or name thereof that identifies the table 

1102 records will be inserted into. 

1103 data : `dict` or `DimensionRecord` (variadic) 

1104 One or more records to insert. 

1105 conform : `bool`, optional 

1106 If `False` (`True` is default) perform no checking or conversions, 

1107 and assume that ``element`` is a `DimensionElement` instance and 

1108 ``data`` is one or more `DimensionRecord` instances of the 

1109 appropriate subclass. 

1110 """ 

1111 if conform: 

1112 if isinstance(element, str): 

1113 element = self.dimensions[element] 

1114 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1115 for row in data] 

1116 else: 

1117 # Ignore typing since caller said to trust them with conform=False. 

1118 records = data # type: ignore 

1119 storage = self._dimensions[element] # type: ignore 

1120 storage.insert(*records) 

1121 

1122 def syncDimensionData(self, element: Union[DimensionElement, str], 

1123 row: Union[Mapping[str, Any], DimensionRecord], 

1124 conform: bool = True) -> bool: 

1125 """Synchronize the given dimension record with the database, inserting 

1126 if it does not already exist and comparing values if it does. 

1127 

1128 Parameters 

1129 ---------- 

1130 element : `DimensionElement` or `str` 

1131 The `DimensionElement` or name thereof that identifies the table 

1132 records will be inserted into. 

1133 row : `dict` or `DimensionRecord` 

1134 The record to insert. 

1135 conform : `bool`, optional 

1136 If `False` (`True` is default) perform no checking or conversions, 

1137 and assume that ``element`` is a `DimensionElement` instance and 

1138 ``data`` is a one or more `DimensionRecord` instances of the 

1139 appropriate subclass. 

1140 

1141 Returns 

1142 ------- 

1143 inserted : `bool` 

1144 `True` if a new row was inserted, `False` otherwise. 

1145 

1146 Raises 

1147 ------ 

1148 ConflictingDefinitionError 

1149 Raised if the record exists in the database (according to primary 

1150 key lookup) but is inconsistent with the given one. 

1151 """ 

1152 if conform: 

1153 if isinstance(element, str): 

1154 element = self.dimensions[element] 

1155 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1156 else: 

1157 # Ignore typing since caller said to trust them with conform=False. 

1158 record = row # type: ignore 

1159 storage = self._dimensions[element] # type: ignore 

1160 return storage.sync(record) 

1161 
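# --- Editorial example (not part of the original module) ---
# Sketch of loading dimension records with `syncDimensionData` and
# `insertDimensionData`; the record fields shown depend on the active
# dimension configuration and are illustrative only.
#
#     registry.syncDimensionData("instrument", {"name": "HSC", "detector_max": 200})
#     registry.insertDimensionData("detector",
#                                  {"instrument": "HSC", "id": 16, "full_name": "1_53"})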

1162 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1163 ) -> Iterator[DatasetType]: 

1164 """Iterate over the dataset types whose names match an expression. 

1165 

1166 Parameters 

1167 ---------- 

1168 expression : `Any`, optional 

1169 An expression that fully or partially identifies the dataset types 

1170 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1171 `...` can be used to return all dataset types, and is the default. 

1172 See :ref:`daf_butler_dataset_type_expressions` for more 

1173 information. 

1174 components : `bool`, optional 

1175 If `True`, apply all expression patterns to component dataset type 

1176 names as well. If `False`, never apply patterns to components. 

1177 If `None` (default), apply patterns to components only if their 

1178 parent datasets were not matched by the expression. 

1179 Fully-specified component datasets (`str` or `DatasetType` 

1180 instances) are always included. 

1181 

1182 Yields 

1183 ------ 

1184 datasetType : `DatasetType` 

1185 A `DatasetType` instance whose name matches ``expression``. 

1186 """ 

1187 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1188 if wildcard is Ellipsis: 

1189 for datasetType in self._datasets: 

1190 # The dataset type can no longer be a component 

1191 yield datasetType 

1192 if components and datasetType.isComposite(): 

1193 # Automatically create the component dataset types 

1194 for component in datasetType.makeAllComponentDatasetTypes(): 

1195 yield component 

1196 return 

1197 done: Set[str] = set() 

1198 for name in wildcard.strings: 

1199 storage = self._datasets.find(name) 

1200 if storage is not None: 

1201 done.add(storage.datasetType.name) 

1202 yield storage.datasetType 

1203 if wildcard.patterns: 

1204 # If components (the argument) is None, we'll save component 

1205 # datasets that we might want to match, but only if their parents 

1206 # didn't get included. 

1207 componentsForLater = [] 

1208 for registeredDatasetType in self._datasets: 

1209 # Components are not stored in registry so expand them here 

1210 allDatasetTypes = [registeredDatasetType] \ 

1211 + registeredDatasetType.makeAllComponentDatasetTypes() 

1212 for datasetType in allDatasetTypes: 

1213 if datasetType.name in done: 

1214 continue 

1215 parentName, componentName = datasetType.nameAndComponent() 

1216 if componentName is not None and not components: 

1217 if components is None and parentName not in done: 

1218 componentsForLater.append(datasetType) 

1219 continue 

1220 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1221 done.add(datasetType.name) 

1222 yield datasetType 

1223 # Go back and try to match saved components. 

1224 for datasetType in componentsForLater: 

1225 parentName, _ = datasetType.nameAndComponent() 

1226 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1227 yield datasetType 

1228 
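# --- Editorial example (not part of the original module) ---
# Sketch of listing registered dataset types; ``...`` matches everything, and
# ``components=False`` suppresses the automatically-generated component types.
#
#     for datasetType in registry.queryDatasetTypes(..., components=False):
#         print(datasetType.name, datasetType.dimensions)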

1229 def queryCollections(self, expression: Any = ..., 

1230 datasetType: Optional[DatasetType] = None, 

1231 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1232 flattenChains: bool = False, 

1233 includeChains: Optional[bool] = None) -> Iterator[str]: 

1234 """Iterate over the collections whose names match an expression. 

1235 

1236 Parameters 

1237 ---------- 

1238 expression : `Any`, optional 

1239 An expression that fully or partially identifies the collections 

1240 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1241 `...` can be used to return all collections, and is the default. 

1242 See :ref:`daf_butler_collection_expressions` for more 

1243 information. 

1244 datasetType : `DatasetType`, optional 

1245 If provided, only yield collections that may contain datasets of 

1246 this type. This is a conservative approximation in general; it may 

1247 yield collections that do not have any such datasets. 

1248 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1249 If provided, only yield collections of these types. 

1250 flattenChains : `bool`, optional 

1251 If `True` (`False` is default), recursively yield the child 

1252 collections of matching `~CollectionType.CHAINED` collections. 

1253 includeChains : `bool`, optional 

1254 If `True`, yield records for matching `~CollectionType.CHAINED` 

1255 collections. Default is the opposite of ``flattenChains``: include 

1256 either CHAINED collections or their children, but not both. 

1257 

1258 Yields 

1259 ------ 

1260 collection : `str` 

1261 The name of a collection that matches ``expression``. 

1262 """ 

1263 # Right now the datasetType argument is completely ignored, but that 

1264 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1265 # ticket will take care of that. 

1266 query = CollectionQuery.fromExpression(expression) 

1267 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes), 

1268 flattenChains=flattenChains, includeChains=includeChains): 

1269 yield record.name 

1270 
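# --- Editorial example (not part of the original module) ---
# Sketch of enumerating RUN collections whose names match a pattern; the
# pattern itself is hypothetical.
#
#     import re
#
#     for name in registry.queryCollections(re.compile(r"^HSC/runs/.*"),
#                                           collectionTypes={CollectionType.RUN}):
#         print(name)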

1271 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1272 """Return a `QueryBuilder` instance capable of constructing and 

1273 managing more complex queries than those obtainable via `Registry` 

1274 interfaces. 

1275 

1276 This is an advanced interface; downstream code should prefer 

1277 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1278 are sufficient. 

1279 

1280 Parameters 

1281 ---------- 

1282 summary : `queries.QuerySummary` 

1283 Object describing and categorizing the full set of dimensions that 

1284 will be included in the query. 

1285 

1286 Returns 

1287 ------- 

1288 builder : `queries.QueryBuilder` 

1289 Object that can be used to construct and perform advanced queries. 

1290 """ 

1291 return queries.QueryBuilder( 

1292 summary, 

1293 queries.RegistryManagers( 

1294 collections=self._collections, 

1295 dimensions=self._dimensions, 

1296 datasets=self._datasets 

1297 ) 

1298 ) 

1299 

1300 def queryDatasets(self, datasetType: Any, *, 

1301 collections: Any, 

1302 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1303 dataId: Optional[DataId] = None, 

1304 where: Optional[str] = None, 

1305 findFirst: bool = False, 

1306 components: Optional[bool] = None, 

1307 **kwargs: Any) -> queries.DatasetQueryResults: 

1308 """Query for and iterate over dataset references matching user-provided 

1309 criteria. 

1310 

1311 Parameters 

1312 ---------- 

1313 datasetType 

1314 An expression that fully or partially identifies the dataset types 

1315 to be queried. Allowed types include `DatasetType`, `str`, 

1316 `re.Pattern`, and iterables thereof. The special value `...` can 

1317 be used to query all dataset types. See 

1318 :ref:`daf_butler_dataset_type_expressions` for more information. 

1319 collections 

1320 An expression that fully or partially identifies the collections 

1321 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1322 thereof. `...` can be used to find datasets from all 

1323 `~CollectionType.RUN` collections (no other collections are 

1324 necessary, because all datasets are in a ``RUN`` collection). See 

1325 :ref:`daf_butler_collection_expressions` for more information. 

1326 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1327 Dimensions to include in the query (in addition to those used 

1328 to identify the queried dataset type(s)), either to constrain 

1329 the resulting datasets to those for which a matching dimension 

1330 exists, or to relate the dataset type's dimensions to dimensions 

1331 referenced by the ``dataId`` or ``where`` arguments. 

1332 dataId : `dict` or `DataCoordinate`, optional 

1333 A data ID whose key-value pairs are used as equality constraints 

1334 in the query. 

1335 where : `str`, optional 

1336 A string expression similar to a SQL WHERE clause. May involve 

1337 any column of a dimension table or (as a shortcut for the primary 

1338 key column of a dimension table) dimension name. See 

1339 :ref:`daf_butler_dimension_expressions` for more information. 

1340 findFirst : `bool`, optional 

1341 If `True` (`False` is default), for each result data ID, only 

1342 yield one `DatasetRef` of each `DatasetType`, from the first 

1343 collection in which a dataset of that dataset type appears 

1344 (according to the order of ``collections`` passed in). If `True`, 

1345 ``collections`` must not contain regular expressions and may not 

1346 be `...`. 

1347 components : `bool`, optional 

1348 If `True`, apply all dataset expression patterns to component 

1349 dataset type names as well. If `False`, never apply patterns to 

1350 components. If `None` (default), apply patterns to components only 

1351 if their parent datasets were not matched by the expression. 

1352 Fully-specified component datasets (`str` or `DatasetType` 

1353 instances) are always included. 

1354 **kwargs 

1355 Additional keyword arguments are forwarded to 

1356 `DataCoordinate.standardize` when processing the ``dataId`` 

1357 argument (and may be used to provide a constraining data ID even 

1358 when the ``dataId`` argument is `None`). 

1359 

1360 Returns 

1361 ------- 

1362 refs : `queries.DatasetQueryResults` 

1363 Dataset references matching the given query criteria. 

1364 

1365 Raises 

1366 ------ 

1367 TypeError 

1368 Raised when the arguments are incompatible, such as when a 

1369 collection wildcard is passed when ``findFirst`` is `True`. 

1370 

1371 Notes 

1372 ----- 

1373 When multiple dataset types are queried in a single call, the 

1374 results of this operation are equivalent to querying for each dataset 

1375 type separately in turn, and no information about the relationships 

1376 between datasets of different types is included. In contexts where 

1377 that kind of information is important, the recommended pattern is to 

1378 use `queryDataIds` to first obtain data IDs (possibly with the 

1379 desired dataset types and collections passed as constraints to the 

1380 query), and then use multiple (generally much simpler) calls to 

1381 `queryDatasets` with the returned data IDs passed as constraints. 

1382 """ 

1383 # Standardize the collections expression. 

1384 if findFirst: 

1385 collections = CollectionSearch.fromExpression(collections) 

1386 else: 

1387 collections = CollectionQuery.fromExpression(collections) 

1388 # Standardize and expand the data ID provided as a constraint. 

1389 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1390 

1391 # We can only query directly if given a non-component DatasetType 

1392 # instance. If we were given an expression or str or a component 

1393 # DatasetType instance, we'll populate this dict, recurse, and return. 

1394 # If we already have a non-component DatasetType, it will remain None 

1395 # and we'll run the query directly. 

1396 composition: Optional[ 

1397 Dict[ 

1398 DatasetType, # parent dataset type 

1399 List[Optional[str]] # component name, or None for parent 

1400 ] 

1401 ] = None 

1402 if not isinstance(datasetType, DatasetType): 

1403 # We were given a dataset type expression (which may be as simple 

1404 # as a str). Loop over all matching dataset types, delegating handling 

1405 # of the `components` argument to queryDatasetTypes, as we populate 

1406 # the composition dict. 

1407 composition = defaultdict(list) 

1408 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1409 parentName, componentName = trueDatasetType.nameAndComponent() 

1410 if componentName is not None: 

1411 parentDatasetType = self.getDatasetType(parentName) 

1412 composition.setdefault(parentDatasetType, []).append(componentName) 

1413 else: 

1414 composition.setdefault(trueDatasetType, []).append(None) 

1415 elif datasetType.isComponent(): 

1416 # We were given a true DatasetType instance, but it's a component. 

1417 # the composition dict will have exactly one item. 

1418 parentName, componentName = datasetType.nameAndComponent() 

1419 parentDatasetType = self.getDatasetType(parentName) 

1420 composition = {parentDatasetType: [componentName]} 

1421 if composition is not None: 

1422 # We need to recurse. Do that once for each parent dataset type. 

1423 chain = [] 

1424 for parentDatasetType, componentNames in composition.items(): 

1425 parentResults = self.queryDatasets( 

1426 parentDatasetType, 

1427 collections=collections, 

1428 dimensions=dimensions, 

1429 dataId=standardizedDataId, 

1430 where=where, 

1431 findFirst=findFirst 

1432 ) 

1433 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1434 chain.append( 

1435 parentResults.withComponents(componentNames) 

1436 ) 

1437 else: 

1438 # Should only happen if we know there would be no results. 

1439 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1440 and not parentResults._chain 

1441 return queries.ChainedDatasetQueryResults(chain) 

1442 # If we get here, there's no need to recurse (or we are already 

1443 # recursing; there can only ever be one level of recursion). 

1444 

1445 # The full set of dimensions in the query is the combination of those 

1446 # needed for the DatasetType and those explicitly requested, if any. 

1447 requestedDimensionNames = set(datasetType.dimensions.names) 

1448 if dimensions is not None: 

1449 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1450 # Construct the summary structure needed to construct a QueryBuilder. 

1451 summary = queries.QuerySummary( 

1452 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1453 dataId=standardizedDataId, 

1454 expression=where, 

1455 ) 

1456 builder = self.makeQueryBuilder(summary) 

1457 # Add the dataset subquery to the query, telling the QueryBuilder to 

1458 # include the rank of the selected collection in the results only if we 

1459 # need find-first resolution. Note that if any of the collections are 

1460 # actually wildcard expressions and ``findFirst`` is `True`, 

1461 # this will raise TypeError for us. 

1462 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1463 return queries.ChainedDatasetQueryResults(()) 

1464 query = builder.finish() 

1465 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1466 

1467 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1468 dataId: Optional[DataId] = None, 

1469 datasets: Any = None, 

1470 collections: Any = None, 

1471 where: Optional[str] = None, 

1472 components: Optional[bool] = None, 

1473 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1474 """Query for data IDs matching user-provided criteria. 

1475 

1476 Parameters 

1477 ---------- 

1478 dimensions : `Dimension` or `str`, or iterable thereof 

1479 The dimensions of the data IDs to yield, as either `Dimension` 

1480 instances or `str`. Will be automatically expanded to a complete 

1481 `DimensionGraph`. 

1482 dataId : `dict` or `DataCoordinate`, optional 

1483 A data ID whose key-value pairs are used as equality constraints 

1484 in the query. 

1485 datasets : `Any`, optional 

1486 An expression that fully or partially identifies dataset types 

1487 that should constrain the yielded data IDs. For example, including 

1488 "raw" here would constrain the yielded ``instrument``, 

1489 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1490 those for which at least one "raw" dataset exists in 

1491 ``collections``. Allowed types include `DatasetType`, `str`, 

1492 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1493 expressions, ``...`` is not permitted - it doesn't make sense to 

1494 constrain data IDs on the existence of *all* datasets. 

1495 See :ref:`daf_butler_dataset_type_expressions` for more 

1496 information. 

1497 collections : `Any`, optional 

1498 An expression that fully or partially identifies the collections 

1499 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1500 thereof. `...` can be used to return all collections. Must be 

1501 provided if ``datasets`` is, and is ignored if it is not. See 

1502 :ref:`daf_butler_collection_expressions` for more information. 

1503 where : `str`, optional 

1504 A string expression similar to a SQL WHERE clause. May involve 

1505 any column of a dimension table or (as a shortcut for the primary 

1506 key column of a dimension table) dimension name. See 

1507 :ref:`daf_butler_dimension_expressions` for more information. 

1508 components : `bool`, optional 

1509 If `True`, apply all dataset expression patterns to component 

1510 dataset type names as well. If `False`, never apply patterns to 

1511 components. If `None` (default), apply patterns to components only 

1512 if their parent datasets were not matched by the expression. 

1513 Fully-specified component datasets (`str` or `DatasetType` 

1514 instances) are always included. 

1515 **kwargs 

1516 Additional keyword arguments are forwarded to 

1517 `DataCoordinate.standardize` when processing the ``dataId`` 

1518 argument (and may be used to provide a constraining data ID even 

1519 when the ``dataId`` argument is `None`). 

1520 

1521 Returns 

1522 ------- 

1523 dataIds : `DataCoordinateQueryResults` 

1524 Data IDs matching the given query parameters. These are guaranteed 

1525 to identify all dimensions (`DataCoordinate.hasFull` returns 

1526 `True`), but will not contain `DimensionRecord` objects 

1527 (`DataCoordinate.hasRecords` returns `False`). Call 

1528 `DataCoordinateQueryResults.expanded` on the returned object to 

1529 fetch those (and consider using 

1530 `DataCoordinateQueryResults.materialize` on the returned object 

1531 first if the expected number of rows is very large). See 

1532 documentation for those methods for additional information. 
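
A hedged sketch of typical use (``registry``, the dataset type, the
collection name, and the ``where`` expression are illustrative assumptions
only)::

    dataIds = registry.queryDataIds(
        ["exposure", "detector"],
        datasets="raw",
        collections="some_collection",
        where="instrument = 'HSC' AND detector = 10",
    ).expanded()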

1533 """ 

1534 dimensions = iterable(dimensions) 

1535 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1536 standardizedDatasetTypes = set() 

1537 requestedDimensions = self.dimensions.extract(dimensions) 

1538 queryDimensionNames = set(requestedDimensions.names) 

1539 if datasets is not None: 

1540 if collections is None: 

1541 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1542 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1543 queryDimensionNames.update(datasetType.dimensions.names) 

1544 # If any matched dataset type is a component, just operate on 

1545 # its parent instead, because Registry doesn't know anything 

1546 # about what components exist, and here (unlike queryDatasets) 

1547 # we don't care about returning them. 

1548 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1549 if componentName is not None: 

1550 datasetType = self.getDatasetType(parentDatasetTypeName) 

1551 standardizedDatasetTypes.add(datasetType) 

1552 # Preprocess collections expression in case the original included 

1553 # single-pass iterators (we'll want to use it multiple times 

1554 # below). 

1555 collections = CollectionQuery.fromExpression(collections) 

1556 

1557 summary = queries.QuerySummary( 

1558 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1559 dataId=standardizedDataId, 

1560 expression=where, 

1561 ) 

1562 builder = self.makeQueryBuilder(summary) 

1563 for datasetType in standardizedDatasetTypes: 

1564 builder.joinDataset(datasetType, collections, isResult=False) 

1565 query = builder.finish() 

1566 return queries.DataCoordinateQueryResults(self._db, query) 

1567 

1568 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1569 dataId: Optional[DataId] = None, 

1570 datasets: Any = None, 

1571 collections: Any = None, 

1572 where: Optional[str] = None, 

1573 components: Optional[bool] = None, 

1574 **kwargs: Any) -> Iterator[DimensionRecord]: 

1575 """Query for dimension information matching user-provided criteria. 

1576 

1577 Parameters 

1578 ---------- 

1579 element : `DimensionElement` or `str` 

1580 The dimension element to obtain records for. 

1581 dataId : `dict` or `DataCoordinate`, optional 

1582 A data ID whose key-value pairs are used as equality constraints 

1583 in the query. 

1584 datasets : `Any`, optional 

1585 An expression that fully or partially identifies dataset types 

1586 that should constrain the yielded records. See `queryDataIds` and 

1587 :ref:`daf_butler_dataset_type_expressions` for more information. 

1588 collections : `Any`, optional 

1589 An expression that fully or partially identifies the collections 

1590 to search for datasets. See `queryDataIds` and 

1591 :ref:`daf_butler_collection_expressions` for more information. 

1592 where : `str`, optional 

1593 A string expression similar to a SQL WHERE clause. See 

1594 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1595 information. 

1596 components : `bool`, optional 

1597 Whether to apply dataset expressions to components as well. 

1598 See `queryDataIds` for more information. 

1599 **kwargs 

1600 Additional keyword arguments are forwarded to 

1601 `DataCoordinate.standardize` when processing the ``dataId`` 

1602 argument (and may be used to provide a constraining data ID even 

1603 when the ``dataId`` argument is `None`). 

1604 

1605 Returns 

1606 ------- 

1607 dimensionRecords : `Iterator` [ `DimensionRecord` ] 

1608 Dimension records matching the given query parameters. 
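
A minimal sketch (``registry``, the element name, and the instrument value
are illustrative assumptions only)::

    for record in registry.queryDimensionRecords(
        "detector",
        where="instrument = 'HSC'",
    ):
        ...  # each ``record`` is a `DimensionRecord` for one detector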

1609 """ 

1610 if not isinstance(element, DimensionElement): 

1611 element = self.dimensions[element] 

1612 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1613 where=where, components=components, **kwargs) 

1614 return iter(self._dimensions[element].fetch(dataIds)) 

1615 

1616 def queryDatasetAssociations( 

1617 self, 

1618 datasetType: Union[str, DatasetType], 

1619 collections: Any = ..., 

1620 *, 

1621 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1622 flattenChains: bool = False, 

1623 ) -> Iterator[DatasetAssociation]: 

1624 """Iterate over dataset-collection combinations where the dataset is in 

1625 the collection. 

1626 

1627 This method is a temporary placeholder for better support for 

1628 association results in `queryDatasets`. It will probably be 

1629 removed in the future, and should be avoided in production code 

1630 whenever possible. 

1631 

1632 Parameters 

1633 ---------- 

1634 datasetType : `DatasetType` or `str` 

1635 A dataset type object or the name of one. 

1636 collections : `Any`, optional 

1637 An expression that fully or partially identifies the collections 

1638 to search for datasets. See `queryCollections` and 

1639 :ref:`daf_butler_collection_expressions` for more information. 

1640 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1641 If provided, only yield associations from collections of these 

1642 types. 

1643 flattenChains : `bool`, optional 

1644 If `True`, search in the children of 

1645 `~CollectionType.CHAINED` collections. If `False` (the default), 

1646 ``CHAINED`` collections are ignored. 

1647 

1648 Yields 

1649 ------ 

1650 association : `DatasetAssociation` 

1651 Object representing the relationship between a single dataset and 

1652 a single collection. 
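
A hedged sketch of typical use (``registry`` and the dataset type name are
illustrative assumptions only)::

    for assoc in registry.queryDatasetAssociations(
        "bias",
        collectionTypes={CollectionType.CALIBRATION},
    ):
        # For CALIBRATION collections the association carries a validity
        # timespan; for other collection types it is `None`.
        print(assoc.ref, assoc.collection, assoc.timespan)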

1653 """ 

1654 collections = CollectionQuery.fromExpression(collections) 

1655 tsRepr = self._db.getTimespanRepresentation() 

1656 if isinstance(datasetType, str): 

1657 storage = self._datasets[datasetType] 

1658 else: 

1659 storage = self._datasets[datasetType.name] 

1660 for collectionRecord in collections.iter(self._collections, 

1661 collectionTypes=frozenset(collectionTypes), 

1662 flattenChains=flattenChains): 

1663 query = storage.select(collectionRecord) 

1664 if query is None: 

1665 continue 

1666 for row in self._db.query(query.combine()): 

1667 dataId = DataCoordinate.fromRequiredValues( 

1668 storage.datasetType.dimensions, 

1669 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1670 ) 

1671 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

1672 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1673 conform=False) 

1674 if collectionRecord.type is CollectionType.CALIBRATION: 

1675 timespan = tsRepr.extract(row) 

1676 else: 

1677 timespan = None 

1678 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1679 

1680 storageClasses: StorageClassFactory 

1681 """All storage classes known to the registry (`StorageClassFactory`). 

1682 """