
1# This file is part of daf_butler. 

2# 

3# Developed for the LSST Data Management System. 

4# This product includes software developed by the LSST Project 

5# (http://www.lsst.org). 

6# See the COPYRIGHT file at the top-level directory of this distribution 

7# for details of code ownership. 

8# 

9# This program is free software: you can redistribute it and/or modify 

10# it under the terms of the GNU General Public License as published by 

11# the Free Software Foundation, either version 3 of the License, or 

12# (at your option) any later version. 

13# 

14# This program is distributed in the hope that it will be useful, 

15# but WITHOUT ANY WARRANTY; without even the implied warranty of 

16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

17# GNU General Public License for more details. 

18# 

19# You should have received a copy of the GNU General Public License 

20# along with this program. If not, see <http://www.gnu.org/licenses/>. 

21 

22from __future__ import annotations 

23 

24__all__ = ( 

25 "Registry", 

26) 

27 

28from collections import defaultdict 

29import contextlib 

30import logging 

31from typing import ( 

32 Any, 

33 Dict, 

34 Iterable, 

35 Iterator, 

36 List, 

37 Mapping, 

38 Optional, 

39 Set, 

40 Type, 

41 TYPE_CHECKING, 

42 Union, 

43) 

44 

45import sqlalchemy 

46 

47from lsst.utils import doImport 

48from ..core import ( 

49 ButlerURI, 

50 Config, 

51 DataCoordinate, 

52 DataCoordinateIterable, 

53 DataId, 

54 DatasetAssociation, 

55 DatasetRef, 

56 DatasetType, 

57 ddl, 

58 Dimension, 

59 DimensionConfig, 

60 DimensionElement, 

61 DimensionGraph, 

62 DimensionRecord, 

63 DimensionUniverse, 

64 NamedKeyMapping, 

65 NameLookupMapping, 

66 StorageClassFactory, 

67 Timespan, 

68) 

69from . import queries 

70from ..core.utils import iterable, transactional 

71from ._config import RegistryConfig 

72from ._collectionType import CollectionType 

73from ._exceptions import ConflictingDefinitionError, InconsistentDataIdError, OrphanedRecordError 

74from .wildcards import CategorizedWildcard, CollectionQuery, CollectionSearch, Ellipsis 

75from .interfaces import ChainedCollectionRecord, RunRecord 

76from .versions import ButlerVersionsManager, DigestMismatchError 

77 

78if TYPE_CHECKING:

79 from .._butlerConfig import ButlerConfig 

80 from .interfaces import ( 

81 ButlerAttributeManager, 

82 CollectionManager, 

83 Database, 

84 OpaqueTableStorageManager, 

85 DimensionRecordStorageManager, 

86 DatasetRecordStorageManager, 

87 DatastoreRegistryBridgeManager, 

88 ) 

89 

90 

91_LOG = logging.getLogger(__name__) 

92 

93# key for dimensions configuration in attributes table 

94_DIMENSIONS_ATTR = "config:dimensions.json" 

95 

96 

97class Registry: 

98 """Registry interface. 

99 

100 Parameters 

101 ---------- 

102 database : `Database` 

103 Database instance to store Registry. 

104 attributes : `type` 

105 Manager class implementing `ButlerAttributeManager`. 

106 opaque : `type` 

107 Manager class implementing `OpaqueTableStorageManager`. 

108 dimensions : `type` 

109 Manager class implementing `DimensionRecordStorageManager`. 

110 collections : `type` 

111 Manager class implementing `CollectionManager`. 

112 datasets : `type` 

113 Manager class implementing `DatasetRecordStorageManager`. 

114 datastoreBridges : `type` 

115 Manager class implementing `DatastoreRegistryBridgeManager`. 

116 dimensionConfig : `DimensionConfig`, optional 

117 Dimension universe configuration, only used when ``create`` is True. 

118 writeable : `bool`, optional 

119 If True then Registry will support write operations. 

120 create : `bool`, optional 

121 If True then the database schema will be initialized; the database

122 must be empty before instantiating Registry.

123 """ 

124 

125 defaultConfigFile: Optional[str] = None 

126 """Path to configuration defaults. Accessed within the ``configs`` resource 

127 or relative to a search path. Can be None if no defaults specified. 

128 """ 

129 

130 @classmethod 

131 def createFromConfig(cls, config: Optional[Union[RegistryConfig, str]] = None, 

132 dimensionConfig: Optional[Union[DimensionConfig, str]] = None, 

133 butlerRoot: Optional[str] = None) -> Registry: 

134 """Create registry database and return `Registry` instance. 

135 

136 This method initializes database contents; the database must be empty

137 prior to calling this method. 

138 

139 Parameters 

140 ---------- 

141 config : `RegistryConfig` or `str`, optional 

142 Registry configuration; if missing, the default configuration will

143 be loaded from registry.yaml. 

144 dimensionConfig : `DimensionConfig` or `str`, optional 

145 Dimensions configuration; if missing, the default configuration

146 will be loaded from dimensions.yaml. 

147 butlerRoot : `str`, optional 

148 Path to the repository root this `Registry` will manage. 

149 

150 Returns 

151 ------- 

152 registry : `Registry` 

153 A new `Registry` instance. 
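
Examples
--------
A minimal usage sketch, assuming a registry configuration file and an
empty database; the paths shown here are hypothetical:

>>> registry = Registry.createFromConfig("registry.yaml",
...                                      butlerRoot="/path/to/repo")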

154 """ 

155 if isinstance(config, str): 

156 config = RegistryConfig(config) 

157 elif config is None: 

158 config = RegistryConfig() 

159 elif not isinstance(config, RegistryConfig): 

160 raise TypeError(f"Incompatible Registry configuration type: {type(config)}") 

161 config.replaceRoot(butlerRoot) 

162 

163 if isinstance(dimensionConfig, str): 

164 dimensionConfig = DimensionConfig(dimensionConfig)

165 elif dimensionConfig is None: 

166 dimensionConfig = DimensionConfig() 

167 elif not isinstance(dimensionConfig, DimensionConfig): 

168 raise TypeError(f"Incompatible Dimension configuration type: {type(dimensionConfig)}") 

169 

170 DatabaseClass = config.getDatabaseClass() 

171 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

172 namespace=config.get("namespace")) 

173 attributes = doImport(config["managers", "attributes"]) 

174 opaque = doImport(config["managers", "opaque"]) 

175 dimensions = doImport(config["managers", "dimensions"]) 

176 collections = doImport(config["managers", "collections"]) 

177 datasets = doImport(config["managers", "datasets"]) 

178 datastoreBridges = doImport(config["managers", "datastores"]) 

179 

180 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

181 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

182 dimensionConfig=dimensionConfig, create=True) 

183 

184 @classmethod 

185 def fromConfig(cls, config: Union[ButlerConfig, RegistryConfig, Config, str], 

186 butlerRoot: Optional[Union[str, ButlerURI]] = None, writeable: bool = True) -> Registry: 

187 """Create `Registry` subclass instance from `config`. 

188 

189 Registry database must be initialized prior to calling this method.

190 

191 Parameters 

192 ---------- 

193 config : `ButlerConfig`, `RegistryConfig`, `Config` or `str` 

194 Registry configuration.

195 butlerRoot : `str` or `ButlerURI`, optional 

196 Path to the repository root this `Registry` will manage. 

197 writeable : `bool`, optional 

198 If `True` (default) create a read-write connection to the database. 

199 

200 Returns 

201 ------- 

202 registry : `Registry` (subclass) 

203 A new `Registry` subclass instance. 
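
Examples
--------
A sketch of connecting read-only to an existing repository; the
configuration file name is hypothetical:

>>> config = RegistryConfig("butler.yaml")
>>> registry = Registry.fromConfig(config, writeable=False)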

204 """ 

205 if not isinstance(config, RegistryConfig): 

206 if isinstance(config, str) or isinstance(config, Config): 

207 config = RegistryConfig(config) 

208 else: 

209 raise ValueError("Incompatible Registry configuration: {}".format(config)) 

210 config.replaceRoot(butlerRoot) 

211 DatabaseClass = config.getDatabaseClass() 

212 database = DatabaseClass.fromUri(str(config.connectionString), origin=config.get("origin", 0), 

213 namespace=config.get("namespace"), writeable=writeable) 

214 attributes = doImport(config["managers", "attributes"]) 

215 opaque = doImport(config["managers", "opaque"]) 

216 dimensions = doImport(config["managers", "dimensions"]) 

217 collections = doImport(config["managers", "collections"]) 

218 datasets = doImport(config["managers", "datasets"]) 

219 datastoreBridges = doImport(config["managers", "datastores"]) 

220 

221 return cls(database, dimensions=dimensions, attributes=attributes, opaque=opaque, 

222 collections=collections, datasets=datasets, datastoreBridges=datastoreBridges, 

223 dimensionConfig=None, writeable=writeable) 

224 

225 def __init__(self, database: Database, *, 

226 attributes: Type[ButlerAttributeManager], 

227 opaque: Type[OpaqueTableStorageManager], 

228 dimensions: Type[DimensionRecordStorageManager], 

229 collections: Type[CollectionManager], 

230 datasets: Type[DatasetRecordStorageManager], 

231 datastoreBridges: Type[DatastoreRegistryBridgeManager], 

232 dimensionConfig: Optional[DimensionConfig] = None, 

233 writeable: bool = True, 

234 create: bool = False): 

235 self._db = database 

236 self.storageClasses = StorageClassFactory() 

237 

238 # With existing registry we have to read dimensions config from 

239 # database before we initialize all other managers. 

240 if dimensionConfig is None: 

241 assert not create, "missing DimensionConfig when create=True" 

242 with self._db.declareStaticTables(create=False) as context: 

243 self._attributes = attributes.initialize(self._db, context) 

244 

245 versions = ButlerVersionsManager( 

246 self._attributes, 

247 dict(attributes=self._attributes) 

248 ) 

249 # verify that configured versions are compatible with schema 

250 versions.checkManagersConfig() 

251 versions.checkManagersVersions(writeable) 

252 

253 # get serialized as a string from database 

254 dimensionsString = self._attributes.get(_DIMENSIONS_ATTR) 

255 if dimensionsString is not None: 

256 dimensionConfig = DimensionConfig(Config.fromString(dimensionsString, format="json")) 

257 else: 

258 raise LookupError(f"Registry attribute {_DIMENSIONS_ATTR} is missing from database") 

259 

260 # make universe 

261 universe = DimensionUniverse(dimensionConfig) 

262 

263 with self._db.declareStaticTables(create=create) as context: 

264 self._attributes = attributes.initialize(self._db, context) 

265 self._dimensions = dimensions.initialize(self._db, context, universe=universe) 

266 self._collections = collections.initialize(self._db, context, dimensions=self._dimensions) 

267 self._datasets = datasets.initialize(self._db, context, 

268 collections=self._collections, 

269 dimensions=self._dimensions) 

270 self._opaque = opaque.initialize(self._db, context) 

271 self._datastoreBridges = datastoreBridges.initialize(self._db, context, 

272 opaque=self._opaque, 

273 datasets=datasets, 

274 universe=self._dimensions.universe) 

275 versions = ButlerVersionsManager( 

276 self._attributes, 

277 dict( 

278 attributes=self._attributes, 

279 opaque=self._opaque, 

280 dimensions=self._dimensions, 

281 collections=self._collections, 

282 datasets=self._datasets, 

283 datastores=self._datastoreBridges, 

284 ) 

285 ) 

286 # store managers and their versions in attributes table 

287 context.addInitializer(lambda db: versions.storeManagersConfig()) 

288 context.addInitializer(lambda db: versions.storeManagersVersions()) 

289 # dump universe config as json into attributes (faster than YAML) 

290 json = dimensionConfig.dump(format="json") 

291 if json is not None: 

292 # Convert Optional[str] to str for mypy 

293 json_str = json 

294 context.addInitializer( 

295 lambda db: self._attributes.set(_DIMENSIONS_ATTR, json_str) 

296 ) 

297 else: 

298 raise RuntimeError("Unexpectedly failed to serialize DimensionConfig to JSON") 

299 

300 if not create: 

301 # verify that configured versions are compatible with schema 

302 versions.checkManagersConfig() 

303 versions.checkManagersVersions(writeable) 

304 try: 

305 versions.checkManagersDigests() 

306 except DigestMismatchError as exc: 

307 # potentially digest mismatch is a serious error but during 

308 # development it could be benign, treat this as warning for 

309 # now. 

310 _LOG.warning(f"Registry schema digest mismatch: {exc}") 

311 

312 self._dimensions.refresh() 

313 self._collections.refresh() 

314 self._datasets.refresh() 

315 

316 def __str__(self) -> str: 

317 return str(self._db) 

318 

319 def __repr__(self) -> str: 

320 return f"Registry({self._db!r}, {self.dimensions!r})" 

321 

322 def isWriteable(self) -> bool: 

323 """Return `True` if this registry allows write operations, and `False` 

324 otherwise. 

325 """ 

326 return self._db.isWriteable() 

327 

328 @property 

329 def dimensions(self) -> DimensionUniverse: 

330 """All dimensions recognized by this `Registry` (`DimensionUniverse`). 

331 """ 

332 return self._dimensions.universe 

333 

334 @contextlib.contextmanager 

335 def transaction(self, *, savepoint: bool = False) -> Iterator[None]: 

336 """Return a context manager that represents a transaction. 

337 """ 

338 try: 

339 with self._db.transaction(savepoint=savepoint): 

340 yield 

341 except BaseException: 

342 # TODO: this clears the caches sometimes when we wouldn't actually 

343 # need to. Can we avoid that? 

344 self._dimensions.clearCaches() 

345 raise 

346 

347 def registerOpaqueTable(self, tableName: str, spec: ddl.TableSpec) -> None: 

348 """Add an opaque (to the `Registry`) table for use by a `Datastore` or 

349 other data repository client. 

350 

351 Opaque table records can be added via `insertOpaqueData`, retrieved via 

352 `fetchOpaqueData`, and removed via `deleteOpaqueData`. 

353 

354 Parameters 

355 ---------- 

356 tableName : `str` 

357 Logical name of the opaque table. This may differ from the 

358 actual name used in the database by a prefix and/or suffix. 

359 spec : `ddl.TableSpec` 

360 Specification for the table to be added. 
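
Examples
--------
A sketch of the opaque-table round trip; the table name and the exact
`ddl.FieldSpec` arguments shown are illustrative only:

>>> spec = ddl.TableSpec(fields=[
...     ddl.FieldSpec("dataset_id", dtype=sqlalchemy.BigInteger, primaryKey=True),
...     ddl.FieldSpec("path", dtype=sqlalchemy.String, length=256),
... ])
>>> registry.registerOpaqueTable("my_datastore_records", spec)
>>> registry.insertOpaqueData("my_datastore_records", {"dataset_id": 1, "path": "a/b.fits"})
>>> rows = list(registry.fetchOpaqueData("my_datastore_records", dataset_id=1))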

361 """ 

362 self._opaque.register(tableName, spec) 

363 

364 @transactional 

365 def insertOpaqueData(self, tableName: str, *data: dict) -> None: 

366 """Insert records into an opaque table. 

367 

368 Parameters 

369 ---------- 

370 tableName : `str` 

371 Logical name of the opaque table. Must match the name used in a 

372 previous call to `registerOpaqueTable`. 

373 data 

374 Each additional positional argument is a dictionary that represents 

375 a single row to be added. 

376 """ 

377 self._opaque[tableName].insert(*data) 

378 

379 def fetchOpaqueData(self, tableName: str, **where: Any) -> Iterator[dict]: 

380 """Retrieve records from an opaque table. 

381 

382 Parameters 

383 ---------- 

384 tableName : `str` 

385 Logical name of the opaque table. Must match the name used in a 

386 previous call to `registerOpaqueTable`. 

387 where 

388 Additional keyword arguments are interpreted as equality 

389 constraints that restrict the returned rows (combined with AND); 

390 keyword arguments are column names and values are the values they 

391 must have. 

392 

393 Yields 

394 ------ 

395 row : `dict` 

396 A dictionary representing a single result row. 

397 """ 

398 yield from self._opaque[tableName].fetch(**where) 

399 

400 @transactional 

401 def deleteOpaqueData(self, tableName: str, **where: Any) -> None: 

402 """Remove records from an opaque table. 

403 

404 Parameters 

405 ---------- 

406 tableName : `str` 

407 Logical name of the opaque table. Must match the name used in a 

408 previous call to `registerOpaqueTable`. 

409 where 

410 Additional keyword arguments are interpreted as equality 

411 constraints that restrict the deleted rows (combined with AND); 

412 keyword arguments are column names and values are the values they 

413 must have. 

414 """ 

415 self._opaque[tableName].delete(**where) 

416 

417 def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED, 

418 doc: Optional[str] = None) -> None: 

419 """Add a new collection if one with the given name does not exist. 

420 

421 Parameters 

422 ---------- 

423 name : `str` 

424 The name of the collection to create. 

425 type : `CollectionType` 

426 Enum value indicating the type of collection to create. 

427 doc : `str`, optional 

428 Documentation string for the collection. 

429 

430 Notes 

431 ----- 

432 This method cannot be called within transactions, as it needs to be 

433 able to perform its own transaction to be concurrent. 

434 """ 

435 self._collections.register(name, type, doc=doc) 

436 

437 def getCollectionType(self, name: str) -> CollectionType: 

438 """Return an enumeration value indicating the type of the given 

439 collection. 

440 

441 Parameters 

442 ---------- 

443 name : `str` 

444 The name of the collection. 

445 

446 Returns 

447 ------- 

448 type : `CollectionType` 

449 Enum value indicating the type of this collection. 

450 

451 Raises 

452 ------ 

453 MissingCollectionError 

454 Raised if no collection with the given name exists. 

455 """ 

456 return self._collections.find(name).type 

457 

458 def registerRun(self, name: str, doc: Optional[str] = None) -> None: 

459 """Add a new run if one with the given name does not exist. 

460 

461 Parameters 

462 ---------- 

463 name : `str` 

464 The name of the run to create. 

465 doc : `str`, optional 

466 Documentation string for the collection. 

467 

468 Notes 

469 ----- 

470 This method cannot be called within transactions, as it needs to be 

471 able to perform its own transaction to be concurrent. 

472 """ 

473 self._collections.register(name, CollectionType.RUN, doc=doc) 

474 

475 @transactional 

476 def removeCollection(self, name: str) -> None: 

477 """Completely remove the given collection. 

478 

479 Parameters 

480 ---------- 

481 name : `str` 

482 The name of the collection to remove. 

483 

484 Raises 

485 ------ 

486 MissingCollectionError 

487 Raised if no collection with the given name exists. 

488 

489 Notes 

490 ----- 

491 If this is a `~CollectionType.RUN` collection, all datasets and quanta 

492 in it are also fully removed. This requires that those datasets be 

493 removed (or at least trashed) from any datastores that hold them first. 

494 

495 A collection may not be deleted as long as it is referenced by a 

496 `~CollectionType.CHAINED` collection; the ``CHAINED`` collection must 

497 be deleted or redefined first. 

498 """ 

499 self._collections.remove(name) 

500 

501 def getCollectionChain(self, parent: str) -> CollectionSearch: 

502 """Return the child collections in a `~CollectionType.CHAINED` 

503 collection. 

504 

505 Parameters 

506 ---------- 

507 parent : `str` 

508 Name of the chained collection. Must have already been added via 

509 a call to `Registry.registerCollection`. 

510 

511 Returns 

512 ------- 

513 children : `CollectionSearch` 

514 An object that defines the search path of the collection. 

515 See :ref:`daf_butler_collection_expressions` for more information. 

516 

517 Raises 

518 ------ 

519 MissingCollectionError 

520 Raised if ``parent`` does not exist in the `Registry`. 

521 TypeError 

522 Raised if ``parent`` does not correspond to a 

523 `~CollectionType.CHAINED` collection. 

524 """ 

525 record = self._collections.find(parent) 

526 if record.type is not CollectionType.CHAINED: 

527 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

528 assert isinstance(record, ChainedCollectionRecord) 

529 return record.children 

530 

531 @transactional 

532 def setCollectionChain(self, parent: str, children: Any) -> None: 

533 """Define or redefine a `~CollectionType.CHAINED` collection. 

534 

535 Parameters 

536 ---------- 

537 parent : `str` 

538 Name of the chained collection. Must have already been added via 

539 a call to `Registry.registerCollection`. 

540 children : `Any` 

541 An expression defining an ordered search of child collections, 

542 generally an iterable of `str`; see 

543 :ref:`daf_butler_collection_expressions` for more information. 

544 

545 Raises 

546 ------ 

547 MissingCollectionError 

548 Raised when any of the given collections do not exist in the 

549 `Registry`. 

550 TypeError 

551 Raised if ``parent`` does not correspond to a 

552 `~CollectionType.CHAINED` collection. 

553 ValueError 

554 Raised if the given collections contain a cycle.
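
Examples
--------
A sketch of defining a chained search path over two hypothetical runs:

>>> registry.registerCollection("defaults", CollectionType.CHAINED)
>>> registry.registerRun("run/a")
>>> registry.registerRun("run/b")
>>> registry.setCollectionChain("defaults", ["run/b", "run/a"])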

555 """ 

556 record = self._collections.find(parent) 

557 if record.type is not CollectionType.CHAINED: 

558 raise TypeError(f"Collection '{parent}' has type {record.type.name}, not CHAINED.") 

559 assert isinstance(record, ChainedCollectionRecord) 

560 children = CollectionSearch.fromExpression(children) 

561 if children != record.children: 

562 record.update(self._collections, children) 

563 

564 def getCollectionDocumentation(self, collection: str) -> Optional[str]: 

565 """Retrieve the documentation string for a collection. 

566 

567 Parameters 

568 ---------- 

569 collection : `str`

570 Name of the collection. 

571 

572 Returns 

573 ------- 

574 docs : `str` or `None` 

575 Docstring for the collection with the given name. 

576 """ 

577 return self._collections.getDocumentation(self._collections.find(collection).key) 

578 

579 def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None: 

580 """Set the documentation string for a collection. 

581 

582 Parameters 

583 ---------- 

584 collection : `str`

585 Name of the collection.

586 doc : `str` or `None`

587 Docstring for the collection with the given name; will replace any 

588 existing docstring. Passing `None` will remove any existing 

589 docstring. 

590 """ 

591 self._collections.setDocumentation(self._collections.find(collection).key, doc) 

592 

593 def registerDatasetType(self, datasetType: DatasetType) -> bool: 

594 """ 

595 Add a new `DatasetType` to the Registry. 

596 

597 It is not an error to register the same `DatasetType` twice. 

598 

599 Parameters 

600 ---------- 

601 datasetType : `DatasetType` 

602 The `DatasetType` to be added. 

603 

604 Returns 

605 ------- 

606 inserted : `bool` 

607 `True` if ``datasetType`` was inserted, `False` if an identical 

608 existing `DatasetType` was found. Note that in either case the

609 DatasetType is guaranteed to be defined in the Registry 

610 consistently with the given definition. 

611 

612 Raises 

613 ------ 

614 ValueError 

615 Raised if the dimensions or storage class are invalid. 

616 ConflictingDefinitionError 

617 Raised if this DatasetType is already registered with a different 

618 definition. 

619 

620 Notes 

621 ----- 

622 This method cannot be called within transactions, as it needs to be 

623 able to perform its own transaction to be concurrent. 
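
Examples
--------
A sketch; the dataset type name, dimensions, and storage class are
hypothetical and must exist in the repository's configuration:

>>> datasetType = DatasetType("flat", ("instrument", "detector", "physical_filter"),
...                           "ExposureF", universe=registry.dimensions)
>>> registry.registerDatasetType(datasetType)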

624 """ 

625 _, inserted = self._datasets.register(datasetType) 

626 return inserted 

627 

628 def removeDatasetType(self, name: str) -> None: 

629 """Remove the named `DatasetType` from the registry. 

630 

631 .. warning:: 

632 

633 Registry caches the dataset type definitions. This means that 

634 deleting the dataset type definition may result in unexpected 

635 behavior from other butler processes that are active that have 

636 not seen the deletion. 

637 

638 Parameters 

639 ---------- 

640 name : `str` 

641 Name of the type to be removed. 

642 

643 Raises 

644 ------ 

645 lsst.daf.butler.registry.OrphanedRecordError 

646 Raised if an attempt is made to remove the dataset type definition 

647 when there are already datasets associated with it. 

648 

649 Notes 

650 ----- 

651 If the dataset type is not registered the method will return without 

652 action. 

653 """ 

654 self._datasets.remove(name) 

655 

656 def getDatasetType(self, name: str) -> DatasetType: 

657 """Get the `DatasetType`. 

658 

659 Parameters 

660 ---------- 

661 name : `str` 

662 Name of the type. 

663 

664 Returns 

665 ------- 

666 type : `DatasetType` 

667 The `DatasetType` associated with the given name. 

668 

669 Raises 

670 ------ 

671 KeyError 

672 Raised if the requested DatasetType could not be found in the registry.

673 """ 

674 return self._datasets[name].datasetType 

675 

676 def findDataset(self, datasetType: Union[DatasetType, str], dataId: Optional[DataId] = None, *, 

677 collections: Any, timespan: Optional[Timespan] = None, 

678 **kwargs: Any) -> Optional[DatasetRef]: 

679 """Find a dataset given its `DatasetType` and data ID. 

680 

681 This can be used to obtain a `DatasetRef` that permits the dataset to 

682 be read from a `Datastore`. If the dataset is a component and can not 

683 be found using the provided dataset type, a dataset ref for the parent 

684 will be returned instead but with the correct dataset type. 

685 

686 Parameters 

687 ---------- 

688 datasetType : `DatasetType` or `str` 

689 A `DatasetType` or the name of one. 

690 dataId : `dict` or `DataCoordinate`, optional 

691 A `dict`-like object containing the `Dimension` links that identify 

692 the dataset within a collection. 

693 collections 

694 An expression that fully or partially identifies the collections to 

695 search for the dataset; see 

696 :ref:`daf_butler_collection_expressions` for more information. 

697 timespan : `Timespan`, optional 

698 A timespan that the validity range of the dataset must overlap. 

699 If not provided, any `~CollectionType.CALIBRATION` collections 

700 matched by the ``collections`` argument will not be searched. 

701 **kwargs 

702 Additional keyword arguments passed to 

703 `DataCoordinate.standardize` to convert ``dataId`` to a true 

704 `DataCoordinate` or augment an existing one. 

705 

706 Returns 

707 ------- 

708 ref : `DatasetRef` 

709 A reference to the dataset, or `None` if no matching Dataset 

710 was found. 

711 

712 Raises 

713 ------ 

714 LookupError 

715 Raised if one or more data ID keys are missing. 

716 KeyError 

717 Raised if the dataset type does not exist. 

718 MissingCollectionError 

719 Raised if any of ``collections`` does not exist in the registry. 

720 

721 Notes 

722 ----- 

723 This method simply returns `None` and does not raise an exception even 

724 when the set of collections searched is intrinsically incompatible with 

725 the dataset type, e.g. if ``datasetType.isCalibration() is False``, but 

726 only `~CollectionType.CALIBRATION` collections are being searched. 

727 This may make it harder to debug some lookup failures, but the behavior 

728 is intentional; we consider it more important that failed searches are 

729 reported consistently, regardless of the reason, and that adding 

730 additional collections that do not contain a match to the search path 

731 never changes the behavior. 
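
Examples
--------
A lookup sketch; the dataset type, data ID values, and collection name
are hypothetical:

>>> ref = registry.findDataset("flat", instrument="HSC", detector=12,
...                            physical_filter="HSC-R", collections="calib/runs")
>>> if ref is not None:
...     print(ref.dataId)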

732 """ 

733 if isinstance(datasetType, DatasetType): 

734 storage = self._datasets[datasetType.name] 

735 else: 

736 storage = self._datasets[datasetType] 

737 dataId = DataCoordinate.standardize(dataId, graph=storage.datasetType.dimensions, 

738 universe=self.dimensions, **kwargs) 

739 collections = CollectionSearch.fromExpression(collections) 

740 for collectionRecord in collections.iter(self._collections): 

741 if (collectionRecord.type is CollectionType.CALIBRATION 

742 and (not storage.datasetType.isCalibration() or timespan is None)): 

743 continue 

744 result = storage.find(collectionRecord, dataId, timespan=timespan) 

745 if result is not None: 

746 return result 

747 

748 return None 

749 

750 @transactional 

751 def insertDatasets(self, datasetType: Union[DatasetType, str], dataIds: Iterable[DataId], 

752 run: str) -> List[DatasetRef]: 

753 """Insert one or more datasets into the `Registry` 

754 

755 This always adds new datasets; to associate existing datasets with 

756 a new collection, use ``associate``. 

757 

758 Parameters 

759 ---------- 

760 datasetType : `DatasetType` or `str` 

761 A `DatasetType` or the name of one. 

762 dataIds : `~collections.abc.Iterable` of `dict` or `DataCoordinate` 

763 Dimension-based identifiers for the new datasets. 

764 run : `str` 

765 The name of the run that produced the datasets. 

766 

767 Returns 

768 ------- 

769 refs : `list` of `DatasetRef` 

770 Resolved `DatasetRef` instances for all given data IDs (in the same 

771 order). 

772 

773 Raises 

774 ------ 

775 ConflictingDefinitionError 

776 If a dataset with the same dataset type and data ID as one of those 

777 given already exists in ``run``. 

778 MissingCollectionError 

779 Raised if ``run`` does not exist in the registry. 
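
Examples
--------
A sketch; the run name and data ID values are hypothetical, and the
dimension records they reference must already exist:

>>> registry.registerRun("HSC/raw/run1")
>>> (ref,) = registry.insertDatasets(
...     "raw", [{"instrument": "HSC", "exposure": 903334, "detector": 12}],
...     run="HSC/raw/run1")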

780 """ 

781 if isinstance(datasetType, DatasetType): 

782 storage = self._datasets.find(datasetType.name) 

783 if storage is None: 

784 raise LookupError(f"DatasetType '{datasetType}' has not been registered.") 

785 else: 

786 storage = self._datasets.find(datasetType) 

787 if storage is None: 

788 raise LookupError(f"DatasetType with name '{datasetType}' has not been registered.") 

789 runRecord = self._collections.find(run) 

790 if runRecord.type is not CollectionType.RUN: 

791 raise TypeError(f"Given collection is of type {runRecord.type.name}; RUN collection required.") 

792 assert isinstance(runRecord, RunRecord) 

793 expandedDataIds = [self.expandDataId(dataId, graph=storage.datasetType.dimensions) 

794 for dataId in dataIds] 

795 try: 

796 refs = list(storage.insert(runRecord, expandedDataIds)) 

797 except sqlalchemy.exc.IntegrityError as err: 

798 raise ConflictingDefinitionError(f"A database constraint failure was triggered by inserting " 

799 f"one or more datasets of type {storage.datasetType} into " 

800 f"collection '{run}'. " 

801 f"This probably means a dataset with the same data ID " 

802 f"and dataset type already exists, but it may also mean a " 

803 f"dimension row is missing.") from err 

804 return refs 

805 

806 def getDataset(self, id: int) -> Optional[DatasetRef]: 

807 """Retrieve a Dataset entry. 

808 

809 Parameters 

810 ---------- 

811 id : `int` 

812 The unique identifier for the dataset. 

813 

814 Returns 

815 ------- 

816 ref : `DatasetRef` or `None` 

817 A ref to the Dataset, or `None` if no matching Dataset 

818 was found. 

819 """ 

820 ref = self._datasets.getDatasetRef(id) 

821 if ref is None: 

822 return None 

823 return ref 

824 

825 @transactional 

826 def removeDatasets(self, refs: Iterable[DatasetRef]) -> None: 

827 """Remove datasets from the Registry. 

828 

829 The datasets will be removed unconditionally from all collections, and 

830 any `Quantum` that consumed this dataset will instead be marked with 

831 having a NULL input. `Datastore` records will *not* be deleted; the 

832 caller is responsible for ensuring that the dataset has already been 

833 removed from all Datastores. 

834 

835 Parameters 

836 ---------- 

837 refs : `Iterable` of `DatasetRef` 

838 References to the datasets to be removed. Must include a valid 

839 ``id`` attribute, and should be considered invalidated upon return. 

840 

841 Raises 

842 ------ 

843 AmbiguousDatasetError 

844 Raised if any ``ref.id`` is `None`. 

845 OrphanedRecordError 

846 Raised if any dataset is still present in any `Datastore`. 

847 """ 

848 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

849 storage = self._datasets.find(datasetType.name) 

850 assert storage is not None 

851 try: 

852 storage.delete(refsForType) 

853 except sqlalchemy.exc.IntegrityError as err: 

854 raise OrphanedRecordError("One or more datasets is still " 

855 "present in one or more Datastores.") from err 

856 

857 @transactional 

858 def associate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

859 """Add existing datasets to a `~CollectionType.TAGGED` collection. 

860 

861 If a DatasetRef with the same exact integer ID is already in a 

862 collection, nothing is changed. If a `DatasetRef` with the same

863 `DatasetType` and data ID but with different integer ID 

864 exists in the collection, `ConflictingDefinitionError` is raised. 

865 

866 Parameters 

867 ---------- 

868 collection : `str` 

869 Indicates the collection the datasets should be associated with. 

870 refs : `Iterable` [ `DatasetRef` ] 

871 An iterable of resolved `DatasetRef` instances that already exist 

872 in this `Registry`. 

873 

874 Raises 

875 ------ 

876 ConflictingDefinitionError 

877 If a Dataset with the given `DatasetRef` already exists in the 

878 given collection. 

879 AmbiguousDatasetError 

880 Raised if ``any(ref.id is None for ref in refs)``. 

881 MissingCollectionError 

882 Raised if ``collection`` does not exist in the registry. 

883 TypeError 

884 Raised if adding new datasets to the given ``collection`` is not

885 allowed.
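
Examples
--------
A sketch of tagging previously inserted datasets; the collection name
is hypothetical and ``refs`` are assumed to be resolved `DatasetRef`
instances:

>>> registry.registerCollection("best_calibs", CollectionType.TAGGED)
>>> registry.associate("best_calibs", refs)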

886 """ 

887 collectionRecord = self._collections.find(collection) 

888 if collectionRecord.type is not CollectionType.TAGGED: 

889 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}, not TAGGED.") 

890 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

891 storage = self._datasets.find(datasetType.name) 

892 assert storage is not None 

893 try: 

894 storage.associate(collectionRecord, refsForType) 

895 except sqlalchemy.exc.IntegrityError as err: 

896 raise ConflictingDefinitionError( 

897 f"Constraint violation while associating dataset of type {datasetType.name} with " 

898 f"collection {collection}. This probably means that one or more datasets with the same " 

899 f"dataset type and data ID already exist in the collection, but it may also indicate " 

900 f"that the datasets do not exist." 

901 ) from err 

902 

903 @transactional 

904 def disassociate(self, collection: str, refs: Iterable[DatasetRef]) -> None: 

905 """Remove existing datasets from a `~CollectionType.TAGGED` collection. 

906 

907 ``collection`` and ``ref`` combinations that are not currently 

908 associated are silently ignored. 

909 

910 Parameters 

911 ---------- 

912 collection : `str` 

913 The collection the datasets should no longer be associated with. 

914 refs : `Iterable` [ `DatasetRef` ] 

915 An iterable of resolved `DatasetRef` instances that already exist 

916 in this `Registry`. 

917 

918 Raises 

919 ------ 

920 AmbiguousDatasetError 

921 Raised if any of the given dataset references is unresolved. 

922 MissingCollectionError 

923 Raised if ``collection`` does not exist in the registry. 

924 TypeError 

925 Raised if removing datasets from the given ``collection`` is not

926 allowed.

927 """ 

928 collectionRecord = self._collections.find(collection) 

929 if collectionRecord.type is not CollectionType.TAGGED: 

930 raise TypeError(f"Collection '{collection}' has type {collectionRecord.type.name}; " 

931 "expected TAGGED.") 

932 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

933 storage = self._datasets.find(datasetType.name) 

934 assert storage is not None 

935 storage.disassociate(collectionRecord, refsForType) 

936 

937 @transactional 

938 def certify(self, collection: str, refs: Iterable[DatasetRef], timespan: Timespan) -> None: 

939 """Associate one or more datasets with a calibration collection and a 

940 validity range within it. 

941 

942 Parameters 

943 ---------- 

944 collection : `str` 

945 The name of an already-registered `~CollectionType.CALIBRATION` 

946 collection. 

947 refs : `Iterable` [ `DatasetRef` ] 

948 Datasets to be associated. 

949 timespan : `Timespan` 

950 The validity range for these datasets within the collection. 

951 

952 Raises 

953 ------ 

954 AmbiguousDatasetError 

955 Raised if any of the given `DatasetRef` instances is unresolved. 

956 ConflictingDefinitionError 

957 Raised if the collection already contains a different dataset with 

958 the same `DatasetType` and data ID and an overlapping validity 

959 range. 

960 TypeError 

961 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

962 collection or if one or more datasets are of a dataset type for 

963 which `DatasetType.isCalibration` returns `False`. 
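
Examples
--------
A sketch of declaring a validity range; the collection name is
hypothetical and ``refs`` are assumed to be resolved calibration
`DatasetRef` instances:

>>> import astropy.time
>>> begin = astropy.time.Time("2020-01-01", scale="tai")
>>> end = astropy.time.Time("2020-06-01", scale="tai")
>>> registry.certify("calib/biases", refs, Timespan(begin, end))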

964 """ 

965 collectionRecord = self._collections.find(collection) 

966 for datasetType, refsForType in DatasetRef.groupByType(refs).items(): 

967 storage = self._datasets[datasetType.name] 

968 storage.certify(collectionRecord, refsForType, timespan) 

969 

970 @transactional 

971 def decertify(self, collection: str, datasetType: Union[str, DatasetType], timespan: Timespan, *, 

972 dataIds: Optional[Iterable[DataId]] = None) -> None: 

973 """Remove or adjust datasets to clear a validity range within a 

974 calibration collection. 

975 

976 Parameters 

977 ---------- 

978 collection : `str` 

979 The name of an already-registered `~CollectionType.CALIBRATION` 

980 collection. 

981 datasetType : `str` or `DatasetType` 

982 Name or `DatasetType` instance for the datasets to be decertified. 

983 timespan : `Timespan`

984 The validity range to remove datasets from within the collection. 

985 Datasets that overlap this range but are not contained by it will 

986 have their validity ranges adjusted to not overlap it, which may 

987 split a single dataset validity range into two. 

988 dataIds : `Iterable` [ `DataId` ], optional 

989 Data IDs that should be decertified within the given validity range.

990 If `None`, all data IDs for ``datasetType`` will be

991 decertified. 

992 

993 Raises 

994 ------ 

995 TypeError 

996 Raised if ``collection`` is not a `~CollectionType.CALIBRATION` 

997 collection or if ``datasetType.isCalibration() is False``. 

998 """ 

999 collectionRecord = self._collections.find(collection) 

1000 if isinstance(datasetType, str): 

1001 storage = self._datasets[datasetType] 

1002 else: 

1003 storage = self._datasets[datasetType.name] 

1004 standardizedDataIds = None 

1005 if dataIds is not None: 

1006 standardizedDataIds = [DataCoordinate.standardize(d, graph=storage.datasetType.dimensions) 

1007 for d in dataIds] 

1008 storage.decertify(collectionRecord, timespan, dataIds=standardizedDataIds) 

1009 

1010 def getDatastoreBridgeManager(self) -> DatastoreRegistryBridgeManager: 

1011 """Return an object that allows a new `Datastore` instance to 

1012 communicate with this `Registry`. 

1013 

1014 Returns 

1015 ------- 

1016 manager : `DatastoreRegistryBridgeManager` 

1017 Object that mediates communication between this `Registry` and its 

1018 associated datastores. 

1019 """ 

1020 return self._datastoreBridges 

1021 

1022 def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]: 

1023 """Retrieve datastore locations for a given dataset. 

1024 

1025 Parameters 

1026 ---------- 

1027 ref : `DatasetRef` 

1028 A reference to the dataset for which to retrieve storage 

1029 information. 

1030 

1031 Returns 

1032 ------- 

1033 datastores : `Iterable` [ `str` ] 

1034 All the matching datastores holding this dataset. 

1035 

1036 Raises 

1037 ------ 

1038 AmbiguousDatasetError 

1039 Raised if ``ref.id`` is `None`. 

1040 """ 

1041 return self._datastoreBridges.findDatastores(ref) 

1042 

1043 def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None, 

1044 records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None, 

1045 **kwargs: Any) -> DataCoordinate: 

1046 """Expand a dimension-based data ID to include additional information. 

1047 

1048 Parameters 

1049 ---------- 

1050 dataId : `DataCoordinate` or `dict`, optional 

1051 Data ID to be expanded; augmented and overridden by ``kwargs``.

1052 graph : `DimensionGraph`, optional 

1053 Set of dimensions for the expanded ID. If `None`, the dimensions 

1054 will be inferred from the keys of ``dataId`` and ``kwargs``.

1055 Dimensions that are in ``dataId`` or ``kwargs`` but not in ``graph``

1056 are silently ignored, providing a way to extract and expand a 

1057 subset of a data ID. 

1058 records : `Mapping` [`str`, `DimensionRecord`], optional 

1059 Dimension record data to use before querying the database for that 

1060 data, keyed by element name. 

1061 **kwargs 

1062 Additional keywords are treated like additional key-value pairs for 

1063 ``dataId``, extending and overriding it.

1064 

1065 Returns 

1066 ------- 

1067 expanded : `DataCoordinate` 

1068 A data ID that includes full metadata for all of the dimensions it 

1069 identifies, i.e. guarantees that ``expanded.hasRecords()`` and

1070 ``expanded.hasFull()`` both return `True`. 
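
Examples
--------
A sketch; the dimension values are hypothetical and must already be
present in the registry:

>>> dataId = registry.expandDataId(instrument="HSC", detector=12)
>>> dataId.hasRecords()
True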

1071 """ 

1072 standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions, **kwargs) 

1073 if standardized.hasRecords(): 

1074 return standardized 

1075 if records is None: 

1076 records = {} 

1077 elif isinstance(records, NamedKeyMapping): 

1078 records = records.byName() 

1079 else: 

1080 records = dict(records) 

1081 if isinstance(dataId, DataCoordinate) and dataId.hasRecords(): 

1082 records.update(dataId.records.byName()) 

1083 keys = standardized.byName() 

1084 for element in standardized.graph.primaryKeyTraversalOrder: 

1085 record = records.get(element.name, ...) # Use ... to mean not found; None might mean NULL 

1086 if record is ...: 

1087 if isinstance(element, Dimension) and keys.get(element.name) is None: 

1088 if element in standardized.graph.required: 

1089 raise LookupError( 

1090 f"No value or null value for required dimension {element.name}." 

1091 ) 

1092 keys[element.name] = None 

1093 record = None 

1094 else: 

1095 storage = self._dimensions[element] 

1096 dataIdSet = DataCoordinateIterable.fromScalar( 

1097 DataCoordinate.standardize(keys, graph=element.graph) 

1098 ) 

1099 fetched = tuple(storage.fetch(dataIdSet)) 

1100 try: 

1101 (record,) = fetched 

1102 except ValueError: 

1103 record = None 

1104 records[element.name] = record 

1105 if record is not None: 

1106 for d in element.implied: 

1107 value = getattr(record, d.name) 

1108 if keys.setdefault(d.name, value) != value: 

1109 raise InconsistentDataIdError( 

1110 f"Data ID {standardized} has {d.name}={keys[d.name]!r}, " 

1111 f"but {element.name} implies {d.name}={value!r}." 

1112 ) 

1113 else: 

1114 if element in standardized.graph.required: 

1115 raise LookupError( 

1116 f"Could not fetch record for required dimension {element.name} via keys {keys}." 

1117 ) 

1118 if element.alwaysJoin: 

1119 raise InconsistentDataIdError( 

1120 f"Could not fetch record for element {element.name} via keys {keys}, ", 

1121 "but it is marked alwaysJoin=True; this means one or more dimensions are not " 

1122 "related." 

1123 ) 

1124 for d in element.implied: 

1125 keys.setdefault(d.name, None) 

1126 records.setdefault(d.name, None) 

1127 return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records) 

1128 

1129 def insertDimensionData(self, element: Union[DimensionElement, str], 

1130 *data: Union[Mapping[str, Any], DimensionRecord], 

1131 conform: bool = True) -> None: 

1132 """Insert one or more dimension records into the database. 

1133 

1134 Parameters 

1135 ---------- 

1136 element : `DimensionElement` or `str` 

1137 The `DimensionElement` or name thereof that identifies the table 

1138 records will be inserted into. 

1139 data : `dict` or `DimensionRecord` (variadic) 

1140 One or more records to insert. 

1141 conform : `bool`, optional 

1142 If `False` (`True` is default) perform no checking or conversions, 

1143 and assume that ``element`` is a `DimensionElement` instance and 

1144 ``data`` is one or more `DimensionRecord` instances of the

1145 appropriate subclass. 
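
Examples
--------
A sketch; the record contents are hypothetical and the required fields
depend on the configured dimension universe:

>>> registry.insertDimensionData("instrument", {"name": "DummyCam"})
>>> registry.insertDimensionData("detector", {"instrument": "DummyCam",
...                                           "id": 1, "full_name": "1"})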

1146 """ 

1147 if conform: 

1148 if isinstance(element, str): 

1149 element = self.dimensions[element] 

1150 records = [row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1151 for row in data] 

1152 else: 

1153 # Ignore typing since caller said to trust them with conform=False. 

1154 records = data # type: ignore 

1155 storage = self._dimensions[element] # type: ignore 

1156 storage.insert(*records) 

1157 

1158 def syncDimensionData(self, element: Union[DimensionElement, str], 

1159 row: Union[Mapping[str, Any], DimensionRecord], 

1160 conform: bool = True) -> bool: 

1161 """Synchronize the given dimension record with the database, inserting 

1162 if it does not already exist and comparing values if it does. 

1163 

1164 Parameters 

1165 ---------- 

1166 element : `DimensionElement` or `str` 

1167 The `DimensionElement` or name thereof that identifies the table 

1168 records will be inserted into. 

1169 row : `dict` or `DimensionRecord` 

1170 The record to insert. 

1171 conform : `bool`, optional 

1172 If `False` (`True` is default) perform no checking or conversions, 

1173 and assume that ``element`` is a `DimensionElement` instance and 

1174 ``row`` is a `DimensionRecord` instance of the

1175 appropriate subclass.

1176 

1177 Returns 

1178 ------- 

1179 inserted : `bool` 

1180 `True` if a new row was inserted, `False` otherwise. 

1181 

1182 Raises 

1183 ------ 

1184 ConflictingDefinitionError 

1185 Raised if the record exists in the database (according to primary 

1186 key lookup) but is inconsistent with the given one. 

1187 """ 

1188 if conform: 

1189 if isinstance(element, str): 

1190 element = self.dimensions[element] 

1191 record = row if isinstance(row, DimensionRecord) else element.RecordClass(**row) 

1192 else: 

1193 # Ignore typing since caller said to trust them with conform=False. 

1194 record = row # type: ignore 

1195 storage = self._dimensions[element] # type: ignore 

1196 return storage.sync(record) 

1197 

1198 def queryDatasetTypes(self, expression: Any = ..., *, components: Optional[bool] = None 

1199 ) -> Iterator[DatasetType]: 

1200 """Iterate over the dataset types whose names match an expression. 

1201 

1202 Parameters 

1203 ---------- 

1204 expression : `Any`, optional 

1205 An expression that fully or partially identifies the dataset types 

1206 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1207 `...` can be used to return all dataset types, and is the default. 

1208 See :ref:`daf_butler_dataset_type_expressions` for more 

1209 information. 

1210 components : `bool`, optional 

1211 If `True`, apply all expression patterns to component dataset type 

1212 names as well. If `False`, never apply patterns to components. 

1213 If `None` (default), apply patterns to components only if their 

1214 parent datasets were not matched by the expression. 

1215 Fully-specified component datasets (`str` or `DatasetType` 

1216 instances) are always included. 

1217 

1218 Yields 

1219 ------ 

1220 datasetType : `DatasetType` 

1221 A `DatasetType` instance whose name matches ``expression``. 
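
Examples
--------
A sketch using a regular expression; the dataset type names involved
are hypothetical:

>>> import re
>>> names = [dt.name for dt in registry.queryDatasetTypes(re.compile("^calexp"))]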

1222 """ 

1223 wildcard = CategorizedWildcard.fromExpression(expression, coerceUnrecognized=lambda d: d.name) 

1224 if wildcard is Ellipsis: 

1225 for datasetType in self._datasets: 

1226 # The dataset type can no longer be a component 

1227 yield datasetType 

1228 if components: 

1229 # Automatically create the component dataset types 

1230 try: 

1231 componentsForDatasetType = datasetType.makeAllComponentDatasetTypes() 

1232 except KeyError as err: 

1233 _LOG.warning(f"Could not load storage class {err} for {datasetType.name}; " 

1234 "if it has components they will not be included in query results.") 

1235 else: 

1236 yield from componentsForDatasetType 

1237 return 

1238 done: Set[str] = set() 

1239 for name in wildcard.strings: 

1240 storage = self._datasets.find(name) 

1241 if storage is not None: 

1242 done.add(storage.datasetType.name) 

1243 yield storage.datasetType 

1244 if wildcard.patterns: 

1245 # If components (the argument) is None, we'll save component 

1246 # datasets that we might want to match, but only if their parents

1247 # didn't get included. 

1248 componentsForLater = [] 

1249 for registeredDatasetType in self._datasets: 

1250 # Components are not stored in registry so expand them here 

1251 allDatasetTypes = [registeredDatasetType] 

1252 try: 

1253 allDatasetTypes.extend(registeredDatasetType.makeAllComponentDatasetTypes()) 

1254 except KeyError as err: 

1255 _LOG.warning(f"Could not load storage class {err} for {registeredDatasetType.name}; " 

1256 "if it has components they will not be included in query results.") 

1257 for datasetType in allDatasetTypes: 

1258 if datasetType.name in done: 

1259 continue 

1260 parentName, componentName = datasetType.nameAndComponent() 

1261 if componentName is not None and not components: 

1262 if components is None and parentName not in done: 

1263 componentsForLater.append(datasetType) 

1264 continue 

1265 if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1266 done.add(datasetType.name) 

1267 yield datasetType 

1268 # Go back and try to match saved components. 

1269 for datasetType in componentsForLater: 

1270 parentName, _ = datasetType.nameAndComponent() 

1271 if parentName not in done and any(p.fullmatch(datasetType.name) for p in wildcard.patterns): 

1272 yield datasetType 

1273 

1274 def queryCollections(self, expression: Any = ..., 

1275 datasetType: Optional[DatasetType] = None, 

1276 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1277 flattenChains: bool = False, 

1278 includeChains: Optional[bool] = None) -> Iterator[str]: 

1279 """Iterate over the collections whose names match an expression. 

1280 

1281 Parameters 

1282 ---------- 

1283 expression : `Any`, optional 

1284 An expression that fully or partially identifies the collections 

1285 to return, such as a `str`, `re.Pattern`, or iterable thereof. 

1286 `...` can be used to return all collections, and is the default. 

1287 See :ref:`daf_butler_collection_expressions` for more 

1288 information. 

1289 datasetType : `DatasetType`, optional 

1290 If provided, only yield collections that may contain datasets of 

1291 this type. This is a conservative approximation in general; it may 

1292 yield collections that do not have any such datasets. 

1293 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1294 If provided, only yield collections of these types. 

1295 flattenChains : `bool`, optional 

1296 If `True` (`False` is default), recursively yield the child 

1297 collections of matching `~CollectionType.CHAINED` collections. 

1298 includeChains : `bool`, optional 

1299 If `True`, yield records for matching `~CollectionType.CHAINED` 

1300 collections. Default is the opposite of ``flattenChains``: include 

1301 either CHAINED collections or their children, but not both. 

1302 

1303 Yields 

1304 ------ 

1305 collection : `str` 

1306 The name of a collection that matches ``expression``. 
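
Examples
--------
A sketch restricting the results to RUN collections matching a
hypothetical pattern:

>>> import re
>>> runs = list(registry.queryCollections(re.compile("HSC/runs/.*"),
...                                       collectionTypes={CollectionType.RUN}))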

1307 """ 

1308 # Right now the datasetTypes argument is completely ignored, but that 

1309 # is consistent with its [lack of] guarantees. DM-24939 or a follow-up 

1310 # ticket will take care of that. 

1311 query = CollectionQuery.fromExpression(expression) 

1312 for record in query.iter(self._collections, collectionTypes=frozenset(collectionTypes), 

1313 flattenChains=flattenChains, includeChains=includeChains): 

1314 yield record.name 

1315 

1316 def makeQueryBuilder(self, summary: queries.QuerySummary) -> queries.QueryBuilder: 

1317 """Return a `QueryBuilder` instance capable of constructing and 

1318 managing more complex queries than those obtainable via `Registry` 

1319 interfaces. 

1320 

1321 This is an advanced interface; downstream code should prefer 

1322 `Registry.queryDataIds` and `Registry.queryDatasets` whenever those 

1323 are sufficient. 

1324 

1325 Parameters 

1326 ---------- 

1327 summary : `queries.QuerySummary` 

1328 Object describing and categorizing the full set of dimensions that 

1329 will be included in the query. 

1330 

1331 Returns 

1332 ------- 

1333 builder : `queries.QueryBuilder` 

1334 Object that can be used to construct and perform advanced queries. 

1335 """ 

1336 return queries.QueryBuilder( 

1337 summary, 

1338 queries.RegistryManagers( 

1339 collections=self._collections, 

1340 dimensions=self._dimensions, 

1341 datasets=self._datasets, 

1342 TimespanReprClass=self._db.getTimespanRepresentation(), 

1343 ), 

1344 ) 

1345 

1346 def queryDatasets(self, datasetType: Any, *, 

1347 collections: Any, 

1348 dimensions: Optional[Iterable[Union[Dimension, str]]] = None, 

1349 dataId: Optional[DataId] = None, 

1350 where: Optional[str] = None, 

1351 findFirst: bool = False, 

1352 components: Optional[bool] = None, 

1353 bind: Optional[Mapping[str, Any]] = None, 

1354 check: bool = True, 

1355 **kwargs: Any) -> queries.DatasetQueryResults: 

1356 """Query for and iterate over dataset references matching user-provided 

1357 criteria. 

1358 

1359 Parameters 

1360 ---------- 

1361 datasetType 

1362 An expression that fully or partially identifies the dataset types 

1363 to be queried. Allowed types include `DatasetType`, `str`, 

1364 `re.Pattern`, and iterables thereof. The special value `...` can 

1365 be used to query all dataset types. See 

1366 :ref:`daf_butler_dataset_type_expressions` for more information. 

1367 collections 

1368 An expression that fully or partially identifies the collections 

1369 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1370 thereof. `...` can be used to find datasets from all 

1371 `~CollectionType.RUN` collections (no other collections are 

1372 necessary, because all datasets are in a ``RUN`` collection). See 

1373 :ref:`daf_butler_collection_expressions` for more information. 

1374 dimensions : `~collections.abc.Iterable` of `Dimension` or `str` 

1375 Dimensions to include in the query (in addition to those used 

1376 to identify the queried dataset type(s)), either to constrain 

1377 the resulting datasets to those for which a matching dimension 

1378 exists, or to relate the dataset type's dimensions to dimensions 

1379 referenced by the ``dataId`` or ``where`` arguments. 

1380 dataId : `dict` or `DataCoordinate`, optional 

1381 A data ID whose key-value pairs are used as equality constraints 

1382 in the query. 

1383 where : `str`, optional 

1384 A string expression similar to a SQL WHERE clause. May involve 

1385 any column of a dimension table or (as a shortcut for the primary 

1386 key column of a dimension table) dimension name. See 

1387 :ref:`daf_butler_dimension_expressions` for more information. 

1388 findFirst : `bool`, optional 

1389 If `True` (`False` is default), for each result data ID, only 

1390 yield one `DatasetRef` of each `DatasetType`, from the first 

1391 collection in which a dataset of that dataset type appears 

1392 (according to the order of ``collections`` passed in). If `True`, 

1393 ``collections`` must not contain regular expressions and may not 

1394 be `...`. 

1395 components : `bool`, optional 

1396 If `True`, apply all dataset expression patterns to component 

1397 dataset type names as well. If `False`, never apply patterns to 

1398 components. If `None` (default), apply patterns to components only 

1399 if their parent datasets were not matched by the expression. 

1400 Fully-specified component datasets (`str` or `DatasetType` 

1401 instances) are always included. 

1402 bind : `Mapping`, optional 

1403 Mapping containing literal values that should be injected into the 

1404 ``where`` expression, keyed by the identifiers they replace. 

1405 check : `bool`, optional 

1406 If `True` (default) check the query for consistency before 

1407 executing it. This may reject some valid queries that resemble 

1408 common mistakes (e.g. queries for visits without specifying an 

1409 instrument). 

1410 **kwargs 

1411 Additional keyword arguments are forwarded to 

1412 `DataCoordinate.standardize` when processing the ``dataId`` 

1413 argument (and may be used to provide a constraining data ID even 

1414 when the ``dataId`` argument is `None`). 

1415 

1416 Returns 

1417 ------- 

1418 refs : `queries.DatasetQueryResults` 

1419 Dataset references matching the given query criteria. 

1420 

1421 Raises 

1422 ------ 

1423 TypeError 

1424 Raised when the arguments are incompatible, such as when a 

1425 collection wildcard is passed when ``findFirst`` is `True`. 

1426 

1427 Notes 

1428 ----- 

1429 When multiple dataset types are queried in a single call, the 

1430 results of this operation are equivalent to querying for each dataset 

1431 type separately in turn, and no information about the relationships 

1432 between datasets of different types is included. In contexts where 

1433 that kind of information is important, the recommended pattern is to 

1434 use `queryDataIds` to first obtain data IDs (possibly with the 

1435 desired dataset types and collections passed as constraints to the 

1436 query), and then use multiple (generally much simpler) calls to 

1437 `queryDatasets` with the returned data IDs passed as constraints. 
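
Examples
--------
A minimal, illustrative sketch of a find-first query; ``registry`` is
assumed to be a `Registry` instance, and the dataset type name
``"calexp"``, the collection name ``"HSC/runs/example"``, and the data
ID values are hypothetical names used only for demonstration::

    # Search the given collection and yield at most one "calexp"
    # per matching data ID.
    refs = registry.queryDatasets(
        "calexp",
        collections=["HSC/runs/example"],
        findFirst=True,
        instrument="HSC",
        visit=12345,
    )
    for ref in refs:
        print(ref.dataId, ref.run)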

1438 """ 

1439 # Standardize the collections expression. 

1440 if findFirst: 

1441 collections = CollectionSearch.fromExpression(collections) 

1442 else: 

1443 collections = CollectionQuery.fromExpression(collections) 

1444 # Standardize and expand the data ID provided as a constraint. 

1445 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1446 

1447 # We can only query directly if given a non-component DatasetType 

1448 # instance. If we were given an expression or str or a component 

1449 # DatasetType instance, we'll populate this dict, recurse, and return. 

1450 # If we already have a non-component DatasetType, it will remain None 

1451 # and we'll run the query directly. 

1452 composition: Optional[ 

1453 Dict[ 

1454 DatasetType, # parent dataset type 

1455 List[Optional[str]] # component name, or None for parent 

1456 ] 

1457 ] = None 

1458 if not isinstance(datasetType, DatasetType): 

1459 # We were given a dataset type expression (which may be as simple 

1460 # as a str). Loop over all matching dataset types, delegating

1461 # handling of the `components` argument to queryDatasetTypes, as we populate

1462 # the composition dict. 

1463 composition = defaultdict(list) 

1464 for trueDatasetType in self.queryDatasetTypes(datasetType, components=components): 

1465 parentName, componentName = trueDatasetType.nameAndComponent() 

1466 if componentName is not None: 

1467 parentDatasetType = self.getDatasetType(parentName) 

1468 composition.setdefault(parentDatasetType, []).append(componentName) 

1469 else: 

1470 composition.setdefault(trueDatasetType, []).append(None) 

1471 elif datasetType.isComponent(): 

1472 # We were given a true DatasetType instance, but it's a component. 

1473 # The composition dict will have exactly one item.

1474 parentName, componentName = datasetType.nameAndComponent() 

1475 parentDatasetType = self.getDatasetType(parentName) 

1476 composition = {parentDatasetType: [componentName]} 

1477 if composition is not None: 

1478 # We need to recurse. Do that once for each parent dataset type. 

1479 chain = [] 

1480 for parentDatasetType, componentNames in composition.items(): 

1481 parentResults = self.queryDatasets( 

1482 parentDatasetType, 

1483 collections=collections, 

1484 dimensions=dimensions, 

1485 dataId=standardizedDataId, 

1486 where=where, 

1487 findFirst=findFirst, 

1488 check=check, 

1489 ) 

1490 if isinstance(parentResults, queries.ParentDatasetQueryResults): 

1491 chain.append( 

1492 parentResults.withComponents(componentNames) 

1493 ) 

1494 else: 

1495 # Should only happen if we know there would be no results. 

1496 assert isinstance(parentResults, queries.ChainedDatasetQueryResults) \ 

1497 and not parentResults._chain 

1498 return queries.ChainedDatasetQueryResults(chain) 

1499 # If we get here, there's no need to recurse (or we are already 

1500 # recursing; there can only ever be one level of recursion). 

1501 

1502 # The full set of dimensions in the query is the combination of those 

1503 # needed for the DatasetType and those explicitly requested, if any. 

1504 requestedDimensionNames = set(datasetType.dimensions.names) 

1505 if dimensions is not None: 

1506 requestedDimensionNames.update(self.dimensions.extract(dimensions).names) 

1507 # Construct the summary structure needed to construct a QueryBuilder. 

1508 summary = queries.QuerySummary( 

1509 requested=DimensionGraph(self.dimensions, names=requestedDimensionNames), 

1510 dataId=standardizedDataId, 

1511 expression=where, 

1512 bind=bind, 

1513 check=check, 

1514 ) 

1515 builder = self.makeQueryBuilder(summary) 

1516 # Add the dataset subquery to the query, telling the QueryBuilder to 

1517 # include the rank of the selected collection in the results only if we 

1518 # need to findFirst. Note that if any of the collections are 

1519 # actually wildcard expressions and findFirst has been requested,

1520 # this will raise TypeError for us. 

1521 if not builder.joinDataset(datasetType, collections, isResult=True, findFirst=findFirst): 

1522 return queries.ChainedDatasetQueryResults(()) 

1523 query = builder.finish() 

1524 return queries.ParentDatasetQueryResults(self._db, query, components=[None]) 

1525 

1526 def queryDataIds(self, dimensions: Union[Iterable[Union[Dimension, str]], Dimension, str], *, 

1527 dataId: Optional[DataId] = None, 

1528 datasets: Any = None, 

1529 collections: Any = None, 

1530 where: Optional[str] = None, 

1531 components: Optional[bool] = None, 

1532 bind: Optional[Mapping[str, Any]] = None, 

1533 check: bool = True, 

1534 **kwargs: Any) -> queries.DataCoordinateQueryResults: 

1535 """Query for data IDs matching user-provided criteria. 

1536 

1537 Parameters 

1538 ---------- 

1539 dimensions : `Dimension` or `str`, or iterable thereof 

1540 The dimensions of the data IDs to yield, as either `Dimension` 

1541 instances or `str`. Will be automatically expanded to a complete 

1542 `DimensionGraph`. 

1543 dataId : `dict` or `DataCoordinate`, optional 

1544 A data ID whose key-value pairs are used as equality constraints 

1545 in the query. 

1546 datasets : `Any`, optional 

1547 An expression that fully or partially identifies dataset types 

1548 that should constrain the yielded data IDs. For example, including 

1549 "raw" here would constrain the yielded ``instrument``, 

1550 ``exposure``, ``detector``, and ``physical_filter`` values to only 

1551 those for which at least one "raw" dataset exists in 

1552 ``collections``. Allowed types include `DatasetType`, `str`, 

1553 `re.Pattern`, and iterables thereof. Unlike other dataset type 

1554 expressions, ``...`` is not permitted - it doesn't make sense to 

1555 constrain data IDs on the existence of *all* datasets. 

1556 See :ref:`daf_butler_dataset_type_expressions` for more 

1557 information. 

1558 collections : `Any`, optional

1559 An expression that fully or partially identifies the collections 

1560 to search for datasets, such as a `str`, `re.Pattern`, or iterable 

1561 thereof. `...` can be used to search all collections. Must be

1562 provided if ``datasets`` is, and is ignored if it is not. See 

1563 :ref:`daf_butler_collection_expressions` for more information. 

1564 where : `str`, optional 

1565 A string expression similar to a SQL WHERE clause. May involve 

1566 any column of a dimension table or (as a shortcut for the primary 

1567 key column of a dimension table) dimension name. See 

1568 :ref:`daf_butler_dimension_expressions` for more information. 

1569 components : `bool`, optional 

1570 If `True`, apply all dataset expression patterns to component 

1571 dataset type names as well. If `False`, never apply patterns to 

1572 components. If `None` (default), apply patterns to components only 

1573 if their parent datasets were not matched by the expression. 

1574 Fully-specified component datasets (`str` or `DatasetType` 

1575 instances) are always included. 

1576 bind : `Mapping`, optional 

1577 Mapping containing literal values that should be injected into the 

1578 ``where`` expression, keyed by the identifiers they replace. 

1579 check : `bool`, optional 

1580 If `True` (default) check the query for consistency before 

1581 executing it. This may reject some valid queries that resemble 

1582 common mistakes (e.g. queries for visits without specifying an 

1583 instrument). 

1584 **kwargs 

1585 Additional keyword arguments are forwarded to 

1586 `DataCoordinate.standardize` when processing the ``dataId`` 

1587 argument (and may be used to provide a constraining data ID even 

1588 when the ``dataId`` argument is `None`). 

1589 

1590 Returns 

1591 ------- 

1592 dataIds : `DataCoordinateQueryResults` 

1593 Data IDs matching the given query parameters. These are guaranteed 

1594 to identify all dimensions (`DataCoordinate.hasFull` returns 

1595 `True`), but will not contain `DimensionRecord` objects 

1596 (`DataCoordinate.hasRecords` returns `False`). Call 

1597 `DataCoordinateQueryResults.expanded` on the returned object to 

1598 fetch those (and consider using 

1599 `DataCoordinateQueryResults.materialize` on the returned object 

1600 first if the expected number of rows is very large). See 

1601 documentation for those methods for additional information. 
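
Examples
--------
An illustrative sketch only; ``registry`` is assumed to be a `Registry`
instance, the collection name ``"HSC/raw/all"`` and the data ID value
are hypothetical, and ``"raw"`` follows the example given for the
``datasets`` argument above::

    dataIds = registry.queryDataIds(
        ["exposure", "detector"],
        datasets="raw",
        collections="HSC/raw/all",
        instrument="HSC",
    )
    # expanded() attaches dimension records, as described above.
    for dataId in dataIds.expanded():
        print(dataId["exposure"], dataId["detector"])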

1602 """ 

1603 dimensions = iterable(dimensions) 

1604 standardizedDataId = self.expandDataId(dataId, **kwargs) 

1605 standardizedDatasetTypes = set() 

1606 requestedDimensions = self.dimensions.extract(dimensions) 

1607 queryDimensionNames = set(requestedDimensions.names) 

1608 if datasets is not None: 

1609 if collections is None: 

1610 raise TypeError("Cannot pass 'datasets' without 'collections'.") 

1611 for datasetType in self.queryDatasetTypes(datasets, components=components): 

1612 queryDimensionNames.update(datasetType.dimensions.names) 

1613 # If any matched dataset type is a component, just operate on 

1614 # its parent instead, because Registry doesn't know anything 

1615 # about what components exist, and here (unlike queryDatasets) 

1616 # we don't care about returning them. 

1617 parentDatasetTypeName, componentName = datasetType.nameAndComponent() 

1618 if componentName is not None: 

1619 datasetType = self.getDatasetType(parentDatasetTypeName) 

1620 standardizedDatasetTypes.add(datasetType) 

1621 # Preprocess collections expression in case the original included 

1622 # single-pass iterators (we'll want to use it multiple times 

1623 # below). 

1624 collections = CollectionQuery.fromExpression(collections) 

1625 
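# Build the query summary and QueryBuilder; dataset types found above
# are joined only as constraints (isResult=False), not as results.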

1626 summary = queries.QuerySummary( 

1627 requested=DimensionGraph(self.dimensions, names=queryDimensionNames), 

1628 dataId=standardizedDataId, 

1629 expression=where, 

1630 bind=bind, 

1631 check=check, 

1632 ) 

1633 builder = self.makeQueryBuilder(summary) 

1634 for datasetType in standardizedDatasetTypes: 

1635 builder.joinDataset(datasetType, collections, isResult=False) 

1636 query = builder.finish() 

1637 return queries.DataCoordinateQueryResults(self._db, query) 

1638 

1639 def queryDimensionRecords(self, element: Union[DimensionElement, str], *, 

1640 dataId: Optional[DataId] = None, 

1641 datasets: Any = None, 

1642 collections: Any = None, 

1643 where: Optional[str] = None, 

1644 components: Optional[bool] = None, 

1645 bind: Optional[Mapping[str, Any]] = None, 

1646 check: bool = True, 

1647 **kwargs: Any) -> Iterator[DimensionRecord]: 

1648 """Query for dimension information matching user-provided criteria. 

1649 

1650 Parameters 

1651 ---------- 

1652 element : `DimensionElement` or `str` 

1653 The dimension element to obtain records for.

1654 dataId : `dict` or `DataCoordinate`, optional 

1655 A data ID whose key-value pairs are used as equality constraints 

1656 in the query. 

1657 datasets : `Any`, optional 

1658 An expression that fully or partially identifies dataset types 

1659 that should constrain the yielded records. See `queryDataIds` and 

1660 :ref:`daf_butler_dataset_type_expressions` for more information. 

1661 collections : `Any`, optional

1662 An expression that fully or partially identifies the collections 

1663 to search for datasets. See `queryDataIds` and 

1664 :ref:`daf_butler_collection_expressions` for more information. 

1665 where : `str`, optional 

1666 A string expression similar to a SQL WHERE clause. See 

1667 `queryDataIds` and :ref:`daf_butler_dimension_expressions` for more 

1668 information. 

1669 components : `bool`, optional 

1670 Whether to apply dataset expressions to components as well. 

1671 See `queryDataIds` for more information. 

1672 bind : `Mapping`, optional 

1673 Mapping containing literal values that should be injected into the 

1674 ``where`` expression, keyed by the identifiers they replace. 

1675 check : `bool`, optional 

1676 If `True` (default) check the query for consistency before 

1677 executing it. This may reject some valid queries that resemble 

1678 common mistakes (e.g. queries for visits without specifying an 

1679 instrument). 

1680 **kwargs 

1681 Additional keyword arguments are forwarded to 

1682 `DataCoordinate.standardize` when processing the ``dataId`` 

1683 argument (and may be used to provide a constraining data ID even 

1684 when the ``dataId`` argument is `None`). 

1685 

1686 Returns 

1687 ------- 

1688 dimensionRecords : `Iterator` [ `DimensionRecord` ]

1689 Dimension records matching the given query parameters.
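
Examples
--------
A brief, illustrative sketch; ``registry`` is assumed to be a
`Registry` instance and the instrument value ``"HSC"`` is hypothetical,
used only for demonstration::

    for record in registry.queryDimensionRecords("detector", instrument="HSC"):
        print(record)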

1690 """ 

1691 if not isinstance(element, DimensionElement): 

1692 element = self.dimensions[element] 

1693 dataIds = self.queryDataIds(element.graph, dataId=dataId, datasets=datasets, collections=collections, 

1694 where=where, components=components, bind=bind, check=check, **kwargs) 

1695 return iter(self._dimensions[element].fetch(dataIds)) 

1696 

1697 def queryDatasetAssociations( 

1698 self, 

1699 datasetType: Union[str, DatasetType], 

1700 collections: Any = ..., 

1701 *, 

1702 collectionTypes: Iterable[CollectionType] = CollectionType.all(), 

1703 flattenChains: bool = False, 

1704 ) -> Iterator[DatasetAssociation]: 

1705 """Iterate over dataset-collection combinations where the dataset is in 

1706 the collection. 

1707 

1708 This method is a temporary placeholder for better support for 

1709 association results in `queryDatasets`. It will probably be

1710 removed in the future, and should be avoided in production code 

1711 whenever possible. 

1712 

1713 Parameters 

1714 ---------- 

1715 datasetType : `DatasetType` or `str` 

1716 A dataset type object or the name of one. 

1717 collections : `Any`, optional

1718 An expression that fully or partially identifies the collections 

1719 to search for datasets. See `queryCollections` and 

1720 :ref:`daf_butler_collection_expressions` for more information. 

1721 collectionTypes : `AbstractSet` [ `CollectionType` ], optional 

1722 If provided, only yield associations from collections of these 

1723 types. 

1724 flattenChains : `bool`, optional 

1725 If `True`, search in the children of

1726 `~CollectionType.CHAINED` collections. If `False` (default), ``CHAINED``

1727 collections are ignored. 

1728 

1729 Yields 

1730 ------ 

1731 association : `DatasetAssociation` 

1732 Object representing the relationship between a single dataset and

1733 a single collection. 
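
Examples
--------
An illustrative sketch; ``registry`` is assumed to be a `Registry`
instance and the dataset type name ``"bias"`` is a hypothetical value
used only for demonstration::

    for assoc in registry.queryDatasetAssociations(
        "bias",
        collections=...,
        collectionTypes={CollectionType.CALIBRATION},
    ):
        print(assoc.collection, assoc.ref, assoc.timespan)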

1734 """ 

1735 collections = CollectionQuery.fromExpression(collections) 

1736 TimespanReprClass = self._db.getTimespanRepresentation() 

1737 if isinstance(datasetType, str): 

1738 storage = self._datasets[datasetType] 

1739 else: 

1740 storage = self._datasets[datasetType.name] 

1741 for collectionRecord in collections.iter(self._collections, 

1742 collectionTypes=frozenset(collectionTypes), 

1743 flattenChains=flattenChains): 

1744 query = storage.select(collectionRecord) 

1745 if query is None: 

1746 continue 

1747 for row in self._db.query(query.combine()): 

1748 dataId = DataCoordinate.fromRequiredValues( 

1749 storage.datasetType.dimensions, 

1750 tuple(row[name] for name in storage.datasetType.dimensions.required.names) 

1751 ) 

1752 runRecord = self._collections[row[self._collections.getRunForeignKeyName()]] 

1753 ref = DatasetRef(storage.datasetType, dataId, id=row["id"], run=runRecord.name, 

1754 conform=False) 
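# Only CALIBRATION collections associate a validity-range timespan
# with each dataset, so extract it from the row in that case.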

1755 if collectionRecord.type is CollectionType.CALIBRATION: 

1756 timespan = TimespanReprClass.extract(row) 

1757 else: 

1758 timespan = None 

1759 yield DatasetAssociation(ref=ref, collection=collectionRecord.name, timespan=timespan) 

1760 

1761 storageClasses: StorageClassFactory 

1762 """All storage classes known to the registry (`StorageClassFactory`). 

1763 """