Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

# This file is part of daf_butler. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

 

""" 

Butler top level classes. 

""" 

 

import os 

import contextlib 

import logging 

 

from lsst.utils import doImport 

from .core.utils import transactional 

from .core.datasets import DatasetRef, DatasetType 

from .core.datastore import Datastore 

from .core.registry import Registry 

from .core.run import Run 

from .core.storageClass import StorageClassFactory 

from .core.config import Config, ConfigSubset 

from .core.butlerConfig import ButlerConfig 

from .core.composites import CompositesMap 

from .core.dimensions import DataId 

 

 

# Public API of this module.
__all__ = ("Butler",) 

# Module-level logger, named after the module so it participates in the
# standard hierarchical logging configuration.
log = logging.getLogger(__name__) 

 

 

class Butler:
    """Main entry point for the data access system.

    Attributes
    ----------
    config : `str`, `ButlerConfig` or `Config`, optional
        (filename to) configuration. If this is not a `ButlerConfig`, defaults
        will be read. If a `str`, may be the path to a directory containing
        a "butler.yaml" file.
    datastore : `Datastore`
        Datastore to use for storage.
    registry : `Registry`
        Registry to use for lookups.

    Parameters
    ----------
    config : `Config`
        Configuration.
    collection : `str`, optional
        Collection to use for all input lookups, overriding
        config["collection"] if provided.
    run : `str`, `Run`, optional
        Collection associated with the `Run` to use for outputs, overriding
        config["run"]. If a `Run` associated with the given Collection does
        not exist, it will be created. If "collection" is None, this
        collection will be used for input lookups as well; if not, it must
        have the same value as "run".

    Raises
    ------
    ValueError
        Raised if neither "collection" nor "run" are provided by argument or
        config, or if both are provided and are inconsistent.
    """

    GENERATION = 3
    """This is a Generation 3 Butler.

    This attribute may be removed in the future, once the Generation 2 Butler
    interface has been fully retired; it should only be used in transitional
    code.
    """

 

@staticmethod
def makeRepo(root, config=None, standalone=False, createRegistry=True):
    """Create an empty data repository by writing a butler.yaml config
    into a repository root directory.

    Parameters
    ----------
    root : `str`
        Filesystem path to the root of the new repository.  Created if it
        does not already exist.
    config : `Config`, optional
        Configuration to write to the repository, after setting any
        root-dependent Registry or Datastore config options.  If `None`,
        default configuration will be used.
    standalone : `bool`
        If True, write all expanded defaults rather than just customized
        or repository-specific settings.  This (mostly) decouples the
        repository from the default configuration, insulating it from
        changes to the defaults (which may be good or bad, depending on
        the nature of the changes).  Future *additions* to the defaults
        will still be picked up when initializing `Butlers` to repos
        created with ``standalone=True``.
    createRegistry : `bool`
        If `True` create a new Registry.

    Note that when ``standalone=False`` (the default), the configuration
    search path (see `ConfigSubset.defaultSearchPaths`) that was used to
    construct the repository should also be used to construct any Butlers
    to it, to avoid configuration inconsistencies.

    Returns
    -------
    config : `Config`
        The updated `Config` instance written to the repo.

    Raises
    ------
    ValueError
        Raised if a ButlerConfig or ConfigSubset is passed instead of a
        regular Config (as these subclasses would make it impossible to
        support ``standalone=False``).
    os.error
        Raised if the directory does not exist, exists but is not a
        directory, or cannot be created.
    """
    if isinstance(config, (ButlerConfig, ConfigSubset)):
        raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
    root = os.path.abspath(root)
    if not os.path.isdir(root):
        os.makedirs(root)
    config = Config(config)
    # Expand with defaults so the component classes below can be located.
    full = ButlerConfig(config)
    # Give the datastore and registry classes a chance to rewrite any
    # root-dependent options into both the sparse and the full config.
    for section in ("datastore", "registry"):
        doImport(full[section, "cls"]).setConfigRoot(root, config, full)
    if standalone:
        config.merge(full)
    config.dumpToFile(os.path.join(root, "butler.yaml"))
    # Create Registry and populate tables
    registryClass = doImport(full["registry", "cls"])
    registryClass.fromConfig(config, create=createRegistry)
    return config

 

def __init__(self, config=None, collection=None, run=None):
    # Save the constructor arguments so __reduce__ can re-create this
    # Butler when unpickling.
    self._args = (config, collection, run)
    self.config = ButlerConfig(config)
    self.registry = Registry.fromConfig(self.config)
    self.datastore = Datastore.fromConfig(self.config, self.registry)
    self.storageClasses = StorageClassFactory()
    self.storageClasses.addFromConfig(self.config)
    self.composites = CompositesMap(self.config)
    # Resolve the output run collection name (runCollection) and the
    # input collection, preferring explicit arguments over config.
    if run is None:
        runCollection = self.config.get("run", None)
        self.run = None
    else:
        if isinstance(run, Run):
            # A full Run object was supplied; use it directly.
            self.run = run
            runCollection = self.run.collection
        else:
            # Only a collection name was supplied; the Run object is
            # looked up (or created) below.
            runCollection = run
            self.run = None
        # if run *arg* is not None and collection arg is, use run for collection.
        if collection is None:
            collection = runCollection
    del run  # it's a logic bug if we try to use this variable below
    if collection is None:  # didn't get a collection from collection or run *args*
        collection = self.config.get("collection", None)
        if collection is None:  # didn't get a collection from config["collection"]
            collection = runCollection  # get collection from run found in config
    if collection is None:
        raise ValueError("No run or collection provided.")
    if runCollection is not None and collection != runCollection:
        raise ValueError(
            "Run ({}) and collection ({}) are inconsistent.".format(runCollection, collection)
        )
    self.collection = collection
    # If we only have a run collection name, fetch the existing Run from
    # the registry, creating it if it does not exist yet.
    if runCollection is not None and self.run is None:
        self.run = self.registry.getRun(collection=runCollection)
        if self.run is None:
            self.run = self.registry.makeRun(runCollection)

 

def __reduce__(self): 

"""Support pickling. 

""" 

return (Butler, self._args) 

 

def __str__(self): 

return "Butler(collection='{}', datastore='{}', registry='{}')".format( 

self.collection, self.datastore, self.registry) 

 

@contextlib.contextmanager 

def transaction(self): 

"""Context manager supporting `Butler` transactions. 

 

Transactions can be nested. 

""" 

with self.registry.transaction(): 

with self.datastore.transaction(): 

yield 

 

def _standardizeArgs(self, datasetRefOrType, dataId=None, **kwds):
    """Standardize the arguments passed to several Butler APIs.

    Parameters
    ----------
    datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
        When `DatasetRef` the `dataId` should be `None`.
        Otherwise the `DatasetType` or name thereof.
    dataId : `dict` or `DataId`
        A `dict` of `Dimension` link name, value pairs that label the
        `DatasetRef` within a Collection. When `None`, a `DatasetRef`
        should be provided as the second argument.
    kwds
        Additional keyword arguments used to augment or construct a
        `DataId`. See `DataId` parameters.

    Returns
    -------
    datasetType : `DatasetType`
        A `DatasetType` instance extracted from ``datasetRefOrType``.
    dataId : `dict` or `DataId`, optional
        Argument that can be used (along with ``kwds``) to construct a
        `DataId`.

    Notes
    -----
    Butler APIs that conceptually need a DatasetRef also allow passing a
    `DatasetType` (or the name of one) and a `DataId` (or a dict and
    keyword arguments that can be used to construct one) separately. This
    method accepts those arguments and always returns a true `DatasetType`
    and a `DataId` or `dict`.

    Standardization of `dict` vs `DataId` is best handled by passing the
    returned ``dataId`` (and ``kwds``) to `Registry` APIs, which are
    generally similarly flexible.
    """
    if isinstance(datasetRefOrType, DatasetRef):
        # A full reference already carries both pieces; a separate data
        # ID on top of it would be ambiguous.
        if dataId is not None or kwds:
            raise ValueError("DatasetRef given, cannot use dataId as well")
        return datasetRefOrType.datasetType, datasetRefOrType.dataId
    # Don't check whether DataId is provided, because Registry APIs
    # can usually construct a better error message when it wasn't.
    if isinstance(datasetRefOrType, DatasetType):
        datasetType = datasetRefOrType
    else:
        # A plain name: resolve it through the registry.
        datasetType = self.registry.getDatasetType(datasetRefOrType)
    return datasetType, dataId

 

@transactional
def put(self, obj, datasetRefOrType, dataId=None, producer=None, **kwds):
    """Store and register a dataset.

    Parameters
    ----------
    obj : `object`
        The dataset.
    datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
        When `DatasetRef` is provided, ``dataId`` should be `None`.
        Otherwise the `DatasetType` or name thereof.
    dataId : `dict` or `DataId`
        A `dict` of `Dimension` link name, value pairs that label the
        `DatasetRef` within a Collection. When `None`, a `DatasetRef`
        should be provided as the second argument.
    producer : `Quantum`, optional
        The producer.
    kwds
        Additional keyword arguments used to augment or construct a
        `DataId`. See `DataId` parameters.

    Returns
    -------
    ref : `DatasetRef`
        A reference to the stored dataset, updated with the correct id if
        given.

    Raises
    ------
    TypeError
        Raised if the butler was not constructed with a Run, and is hence
        read-only.
    """
    log.debug("Butler put: %s, dataId=%s, producer=%s", datasetRefOrType, dataId, producer)
    if self.run is None:
        raise TypeError("Butler is read-only.")
    datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
    if isinstance(datasetRefOrType, DatasetRef) and datasetRefOrType.id is not None:
        raise ValueError("DatasetRef must not be in registry, must have None id")

    # Does the StorageClass want this composite stored as components?
    disassemble = self.composites.shouldBeDisassembled(datasetType)

    # Add Registry Dataset entry.  When not disassembling, components
    # are added and attached in the same call; otherwise they are
    # attached one at a time below.
    ref = self.registry.addDataset(datasetType, dataId, run=self.run, producer=producer,
                                   recursive=not disassemble, **kwds)

    if disassemble:
        # Break the object apart and write each component through a
        # recursive put(), attaching it to the parent in the registry.
        componentInfo = datasetType.storageClass.assembler().disassemble(obj)
        for component, info in componentInfo.items():
            compRef = self.put(info.component, datasetType.componentTypeName(component),
                               dataId, producer)
            self.registry.attachComponent(component, ref, compRef)
    else:
        # This is an entity without a disassembler; store it whole.
        self.datastore.put(obj, ref)

    return ref

 

def getDirect(self, ref, parameters=None): 

"""Retrieve a stored dataset. 

 

Unlike `Butler.get`, this method allows datasets outside the Butler's 

collection to be read as long as the `DatasetRef` that identifies them 

can be obtained separately. 

 

Parameters 

---------- 

ref : `DatasetRef` 

Reference to an already stored dataset. 

parameters : `dict` 

Additional StorageClass-defined options to control reading, 

typically used to efficiently read only a subset of the dataset. 

 

Returns 

------- 

obj : `object` 

The dataset. 

""" 

# if the ref exists in the store we return it directly 

if self.datastore.exists(ref): 

return self.datastore.get(ref, parameters=parameters) 

elif ref.isComposite(): 

# Check that we haven't got any unknown parameters 

ref.datasetType.storageClass.validateParameters(parameters) 

# Reconstruct the composite 

usedParams = set() 

components = {} 

for compName, compRef in ref.components.items(): 

# make a dictionary of parameters containing only the subset 

# supported by the StorageClass of the components 

compParams = compRef.datasetType.storageClass.filterParameters(parameters) 

usedParams.update(set(compParams)) 

components[compName] = self.datastore.get(compRef, parameters=compParams) 

 

# Any unused parameters will have to be passed to the assembler 

if parameters: 

unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} 

else: 

unusedParams = {} 

 

# Assemble the components 

inMemoryDataset = ref.datasetType.storageClass.assembler().assemble(components) 

return ref.datasetType.storageClass.assembler().handleParameters(inMemoryDataset, 

parameters=unusedParams) 

else: 

# single entity in datastore 

raise ValueError("Unable to locate ref {} in datastore {}".format(ref.id, self.datastore.name)) 

 

def get(self, datasetRefOrType, dataId=None, parameters=None, **kwds):
    """Retrieve a stored dataset.

    Parameters
    ----------
    datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
        When `DatasetRef` the `dataId` should be `None`.
        Otherwise the `DatasetType` or name thereof.
    dataId : `dict` or `DataId`
        A `dict` of `Dimension` link name, value pairs that label the
        `DatasetRef` within a Collection. When `None`, a `DatasetRef`
        should be provided as the second argument.
    parameters : `dict`
        Additional StorageClass-defined options to control reading,
        typically used to efficiently read only a subset of the dataset.
    kwds
        Additional keyword arguments used to augment or construct a
        `DataId`. See `DataId` parameters.

    Returns
    -------
    obj : `object`
        The dataset.
    """
    log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
    datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
    # Remember any caller-supplied dataset id so it can be checked
    # against the registry below.
    idNumber = datasetRefOrType.id if isinstance(datasetRefOrType, DatasetRef) else None
    # Always lookup the DatasetRef, even if one is given, to ensure it is
    # present in the current collection.
    ref = self.registry.find(self.collection, datasetType, dataId, **kwds)
    if ref is None:
        raise LookupError("Dataset {} with data ID {} could not be found in {}".format(
            datasetType.name, dataId, self.collection))
    if idNumber is not None and idNumber != ref.id:
        raise ValueError("DatasetRef.id does not match id in registry")
    return self.getDirect(ref, parameters=parameters)

 

def getUri(self, datasetRefOrType, dataId=None, predict=False, **kwds):
    """Return the URI to the Dataset.

    Parameters
    ----------
    datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
        When `DatasetRef` the `dataId` should be `None`.
        Otherwise the `DatasetType` or name thereof.
    dataId : `dict` or `DataId`
        A `dict` of `Dimension` link name, value pairs that label the
        `DatasetRef` within a Collection. When `None`, a `DatasetRef`
        should be provided as the second argument.
    predict : `bool`
        If `True`, allow URIs to be returned of datasets that have not
        been written.
    kwds
        Additional keyword arguments used to augment or construct a
        `DataId`. See `DataId` parameters.

    Returns
    -------
    uri : `str`
        URI string pointing to the Dataset within the datastore. If the
        Dataset does not exist in the datastore, and if ``predict`` is
        `True`, the URI will be a prediction and will include a URI
        fragment "#predicted".
        If the datastore does not have entities that relate well
        to the concept of a URI the returned URI string will be
        descriptive. The returned URI is not guaranteed to be obtainable.

    Raises
    ------
    FileNotFoundError
        A URI has been requested for a dataset that does not exist and
        guessing is not allowed.
    """
    datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds)
    # Expand into a full DataId so a predicted ref can be constructed.
    dataId = DataId(dataId, dimensions=datasetType.dimensions,
                    universe=self.registry.dimensions, **kwds)
    ref = self.registry.find(self.collection, datasetType, dataId)
    if ref is None:
        # Not registered: either fail, or predict a location for a
        # dataset that would be written by this Butler's run.
        if not predict:
            raise FileNotFoundError(f"Dataset {datasetType} {dataId} does not exist in Registry.")
        if self.run is None:
            raise ValueError("Cannot predict location from read-only Butler.")
        ref = DatasetRef(datasetType, dataId, run=self.run)
    return self.datastore.getUri(ref, predict)

 

def datasetExists(self, datasetRefOrType, dataId=None, **kwds): 

"""Return True if the Dataset is actually present in the Datastore. 

 

Parameters 

---------- 

datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` 

When `DatasetRef` the `dataId` should be `None`. 

Otherwise the `DatasetType` or name thereof. 

dataId : `dict` or `DataId` 

A `dict` of `Dimension` link name, value pairs that label the 

`DatasetRef` within a Collection. When `None`, a `DatasetRef` 

should be provided as the second argument. 

kwds 

Additional keyword arguments used to augment or construct a 

`DataId`. See `DataId` parameters. 

 

Raises 

------ 

LookupError 

Raised if the Dataset is not even present in the Registry. 

""" 

datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwds) 

ref = self.registry.find(self.collection, datasetType, dataId, **kwds) 

if ref is None: 

raise LookupError( 

"{} with {} not found in collection {}".format(datasetType, dataId, self.collection) 

) 

return self.datastore.exists(ref)