Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

# This file is part of daf_butler. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (http://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

 

""" 

Butler top level classes. 

""" 

 

import os 

import contextlib 

import logging 

 

from lsst.utils import doImport 

from .core.utils import transactional 

from .core.datasets import DatasetRef, DatasetType 

from .core.datastore import Datastore 

from .core.registry import Registry 

from .core.run import Run 

from .core.storageClass import StorageClassFactory 

from .core.config import Config, ConfigSubset 

from .core.butlerConfig import ButlerConfig 

from .core.composites import CompositesMap 

 

 

__all__ = ("Butler",) 

 

log = logging.getLogger(__name__) 

 

 

class Butler:
    """Main entry point for the data access system.

    Parameters
    ----------
    config : `ButlerConfig`, `Config` or `str`, optional
        Configuration, or the filename of a configuration. If this is not a
        `ButlerConfig`, defaults will be read. If a `str`, may be the path
        to a directory containing a "butler.yaml" file.
    collection : `str`, optional
        Collection to use for all input lookups, overriding
        config["collection"] if provided.
    run : `str`, `Run`, optional
        Collection associated with the `Run` to use for outputs, overriding
        config["run"].  If a `Run` associated with the given Collection does
        not exist, it will be created.  If "collection" is None, this
        collection will be used for input lookups as well; if not, it must
        have the same value as "run".

    Attributes
    ----------
    config : `ButlerConfig`
        Configuration the butler was constructed from, with defaults applied.
    registry : `Registry`
        Registry to use for lookups.
    datastore : `Datastore`
        Datastore to use for storage.
    storageClasses : `StorageClassFactory`
        Factory populated from ``config``.
    composites : `CompositesMap`
        Map, built from ``config``, used to decide whether a dataset type
        should be disassembled on `put`.
    collection : `str`
        Collection used for all input lookups.
    run : `Run` or `None`
        Run used for outputs.  `None` means the butler is read-only.

    Raises
    ------
    ValueError
        Raised if neither "collection" nor "run" are provided by argument or
        config, or if both are provided and are inconsistent.
    """

    @staticmethod
    def makeRepo(root, config=None, standalone=False, createRegistry=True):
        """Create an empty data repository by adding a butler.yaml config
        to a repository root directory.

        Parameters
        ----------
        root : `str`
            Filesystem path to the root of the new repository.  Will be
            created if it does not exist.
        config : `Config`, optional
            Configuration to write to the repository, after setting any
            root-dependent Registry or Datastore config options.  If `None`,
            default configuration will be used.
        standalone : `bool`
            If True, write all expanded defaults, not just customized or
            repository-specific settings.
            This (mostly) decouples the repository from the default
            configuration, insulating it from changes to the defaults (which
            may be good or bad, depending on the nature of the changes).
            Future *additions* to the defaults will still be picked up when
            initializing `Butlers` to repos created with ``standalone=True``.
        createRegistry : `bool`
            If `True` create a new Registry.

        Returns
        -------
        config : `Config`
            The updated `Config` instance written to the repo.

        Raises
        ------
        ValueError
            Raised if a ButlerConfig or ConfigSubset is passed instead of a
            regular Config (as these subclasses would make it impossible to
            support ``standalone=False``).
        os.error
            Raised if the directory does not exist, exists but is not a
            directory, or cannot be created.

        Notes
        -----
        When ``standalone=False`` (the default), the configuration search
        path (see `ConfigSubset.defaultSearchPaths`) that was used to
        construct the repository should also be used to construct any
        Butlers to it to avoid configuration inconsistencies.
        """
        if isinstance(config, (ButlerConfig, ConfigSubset)):
            raise ValueError("makeRepo must be passed a regular Config without defaults applied.")
        root = os.path.abspath(root)
        # exist_ok avoids the check-then-create race; a non-directory at
        # ``root`` still raises.
        os.makedirs(root, exist_ok=True)
        config = Config(config)
        full = ButlerConfig(config)  # this applies defaults
        # Let the configured Datastore and Registry classes record their
        # root-dependent settings.
        datastoreClass = doImport(full["datastore", "cls"])
        datastoreClass.setConfigRoot(root, config, full)
        registryClass = doImport(full["registry", "cls"])
        registryClass.setConfigRoot(root, config, full)
        if standalone:
            config.merge(full)
        config.dumpToFile(os.path.join(root, "butler.yaml"))
        # Create Registry and populate tables
        registryClass.fromConfig(config, create=createRegistry)
        return config

    def __init__(self, config=None, collection=None, run=None):
        self.config = ButlerConfig(config)
        self.registry = Registry.fromConfig(self.config)
        self.datastore = Datastore.fromConfig(self.config, self.registry)
        self.storageClasses = StorageClassFactory()
        self.storageClasses.addFromConfig(self.config)
        self.composites = CompositesMap(self.config)
        if run is None:
            runCollection = self.config.get("run", None)
            self.run = None
        else:
            if isinstance(run, Run):
                self.run = run
                runCollection = self.run.collection
            else:
                runCollection = run
                self.run = None
            # if run *arg* is not None and collection arg is, use run for
            # collection.
            if collection is None:
                collection = runCollection
        del run  # it's a logic bug if we try to use this variable below
        if collection is None:  # didn't get a collection from collection or run *args*
            collection = self.config.get("collection", None)
        if collection is None:  # didn't get a collection from config["collection"]
            collection = runCollection  # get collection from run found in config
        if collection is None:
            raise ValueError("No run or collection provided.")
        if runCollection is not None and collection != runCollection:
            raise ValueError(
                "Run ({}) and collection ({}) are inconsistent.".format(runCollection, collection)
            )
        self.collection = collection
        # Only a run collection *name* is known at this point: look the Run
        # up in the registry, creating it if it is not there yet.
        if runCollection is not None and self.run is None:
            self.run = self.registry.getRun(collection=runCollection)
            if self.run is None:
                self.run = self.registry.makeRun(runCollection)

    def __reduce__(self):
        """Support pickling.

        The butler is rebuilt from its configuration together with the
        collection and output run collection it is using, so that values
        supplied as constructor arguments (rather than read from the
        configuration) are not lost.
        """
        runCollection = None if self.run is None else self.run.collection
        return (Butler, (self.config, self.collection, runCollection))

    def __str__(self):
        return "Butler(collection='{}', datastore='{}', registry='{}')".format(
            self.collection, self.datastore, self.registry)

    def _resolveDatasetType(self, datasetTypeOrName):
        """Return the `DatasetType` for a `DatasetType` instance or name.

        Parameters
        ----------
        datasetTypeOrName : `DatasetType` or `str`
            A `DatasetType` instance, which is returned unchanged, or the
            name of one, which is looked up in the registry.

        Returns
        -------
        datasetType : `DatasetType`
            The resolved dataset type.
        """
        if isinstance(datasetTypeOrName, DatasetType):
            return datasetTypeOrName
        return self.registry.getDatasetType(datasetTypeOrName)

    @contextlib.contextmanager
    def transaction(self):
        """Context manager supporting `Butler` transactions.

        Transactions can be nested.
        """
        # The registry transaction is entered first and exited last.
        with self.registry.transaction(), self.datastore.transaction():
            yield

    @transactional
    def put(self, obj, datasetRefOrType, dataId=None, producer=None):
        """Store and register a dataset.

        Parameters
        ----------
        obj : `object`
            The dataset.
        datasetRefOrType : `DatasetRef`, `DatasetType` instance or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict`, optional
            An identifier with `DataUnit` names and values.
            When `None` a `DatasetRef` should be supplied as the second
            argument.
        producer : `Quantum`, optional
            The producer.

        Returns
        -------
        ref : `DatasetRef`
            A reference to the stored dataset, updated with the correct id if
            given.

        Raises
        ------
        TypeError
            Raised if the butler was not constructed with a Run, and is hence
            read-only.
        ValueError
            Raised if a `DatasetRef` is given together with a ``dataId``, if
            the given `DatasetRef` already has an id, or if no ``dataId`` is
            given and ``datasetRefOrType`` is not a `DatasetRef`.
        """
        log.debug("Butler put: %s, dataId=%s, producer=%s", datasetRefOrType, dataId, producer)
        if self.run is None:
            raise TypeError("Butler is read-only.")
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            if datasetRefOrType.id is not None:
                raise ValueError("DatasetRef must not be in registry, must have None id")
            dataId = datasetRefOrType.dataId
            datasetType = datasetRefOrType.datasetType
        else:
            if dataId is None:
                raise ValueError("Must provide a dataId if first argument is not a DatasetRef")
            datasetType = self._resolveDatasetType(datasetRefOrType)

        isVirtualComposite = self.composites.shouldBeDisassembled(datasetType)

        # Add Registry Dataset entry.  If not a virtual composite, add
        # and attach components at the same time.
        ref = self.registry.addDataset(datasetType, dataId, run=self.run, producer=producer,
                                       recursive=not isVirtualComposite)

        # Check to see if this datasetType requires disassembly
        if isVirtualComposite:
            components = datasetType.storageClass.assembler().disassemble(obj)
            for component, info in components.items():
                compTypeName = datasetType.componentTypeName(component)
                # Each component is stored through a recursive put and then
                # attached to the parent entry.
                compRef = self.put(info.component, compTypeName, dataId, producer)
                self.registry.attachComponent(component, ref, compRef)
        else:
            # This is an entity without a disassembler.
            self.datastore.put(obj, ref)

        return ref

    def getDirect(self, ref, parameters=None):
        """Retrieve a stored dataset.

        Unlike `Butler.get`, this method allows datasets outside the Butler's
        collection to be read as long as the `DatasetRef` that identifies them
        can be obtained separately.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to an already stored dataset.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        ValueError
            Raised if the dataset is not in the datastore and is not a
            composite.
        """
        # if the ref exists in the store we return it directly
        if self.datastore.exists(ref):
            return self.datastore.get(ref, parameters=parameters)

        if not ref.isComposite():
            # single entity in datastore
            raise ValueError("Unable to locate ref {} in datastore {}".format(ref.id, self.datastore.name))

        storageClass = ref.datasetType.storageClass

        # Check that we haven't got any unknown parameters
        storageClass.validateParameters(parameters)

        # Reconstruct the composite
        usedParams = set()
        components = {}
        for compName, compRef in ref.components.items():
            # make a dictionary of parameters containing only the subset
            # supported by the StorageClass of the components
            compParams = compRef.datasetType.storageClass.filterParameters(parameters)
            usedParams.update(set(compParams))
            components[compName] = self.datastore.get(compRef, parameters=compParams)

        # Any unused parameters will have to be passed to the assembler
        if parameters:
            unusedParams = {k: v for k, v in parameters.items() if k not in usedParams}
        else:
            unusedParams = {}

        # Assemble the components
        assembler = storageClass.assembler()
        inMemoryDataset = assembler.assemble(components)
        return assembler.handleParameters(inMemoryDataset, parameters=unusedParams)

    def get(self, datasetRefOrType, dataId=None, parameters=None):
        """Retrieve a stored dataset.

        Parameters
        ----------
        datasetRefOrType : `DatasetRef`, `DatasetType` instance or `str`
            When `DatasetRef` the `dataId` should be `None`.
            Otherwise the `DatasetType` or name thereof.
        dataId : `dict`
            A `dict` of `DataUnit` link name, value pairs that label the
            `DatasetRef` within a Collection.
            When `None` a `DatasetRef` should be supplied as the first
            argument.
        parameters : `dict`
            Additional StorageClass-defined options to control reading,
            typically used to efficiently read only a subset of the dataset.

        Returns
        -------
        obj : `object`
            The dataset.

        Raises
        ------
        LookupError
            Raised if the dataset could not be found in this butler's
            collection.
        ValueError
            Raised if a `DatasetRef` is given together with a ``dataId``, or
            if the id of the given `DatasetRef` does not match the id found
            in the registry.
        """
        log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
        if isinstance(datasetRefOrType, DatasetRef):
            if dataId is not None:
                raise ValueError("DatasetRef given, cannot use dataId as well")
            datasetType = datasetRefOrType.datasetType
            dataId = datasetRefOrType.dataId
            idNumber = datasetRefOrType.id
        else:
            datasetType = self._resolveDatasetType(datasetRefOrType)
            idNumber = None
        # Always lookup the DatasetRef, even if one is given, to ensure it is
        # present in the current collection.
        ref = self.registry.find(self.collection, datasetType, dataId)
        if ref is None:
            raise LookupError("Dataset {} with data ID {} could not be found in {}".format(
                datasetType.name, dataId, self.collection))
        if idNumber is not None and idNumber != ref.id:
            raise ValueError("DatasetRef.id does not match id in registry")
        return self.getDirect(ref, parameters=parameters)

    def getUri(self, datasetType, dataId, predict=False):
        """Return the URI to the Dataset.

        Parameters
        ----------
        datasetType : `DatasetType` instance or `str`
            The `DatasetType`.
        dataId : `dict`
            A `dict` of `DataUnit` link name, value pairs that label the
            `DatasetRef` within a Collection.
        predict : `bool`
            If `True`, allow URIs to be returned of datasets that have not
            been written.

        Returns
        -------
        uri : `str`
            URI string pointing to the Dataset within the datastore.  If the
            Dataset does not exist in the datastore, and if ``predict`` is
            `True`, the URI will be a prediction and will include a URI
            fragment "#predicted".
            If the datastore does not have entities that relate well
            to the concept of a URI the returned URI string will be
            descriptive.  The returned URI is not guaranteed to be obtainable.

        Raises
        ------
        FileNotFoundError
            A URI has been requested for a dataset that does not exist and
            guessing is not allowed.
        """
        datasetType = self._resolveDatasetType(datasetType)
        ref = self.registry.find(self.collection, datasetType, dataId)
        if ref is None:
            if not predict:
                raise FileNotFoundError(
                    "Dataset {} with data ID {} could not be found in {}".format(
                        datasetType.name, dataId, self.collection))
            # Nothing is registered yet, so hand the datastore an
            # unregistered reference to predict from.
            # NOTE(review): assumes DatasetRef can be built positionally from
            # (datasetType, dataId) -- confirm against core.datasets.
            ref = DatasetRef(datasetType, dataId)
        return self.datastore.getUri(ref, predict)

    def datasetExists(self, datasetType, dataId):
        """Return True if the Dataset is actually present in the Datastore.

        Parameters
        ----------
        datasetType : `DatasetType` instance or `str`
            The `DatasetType`.
        dataId : `dict`
            A `dict` of `DataUnit` link name, value pairs that label the
            `DatasetRef` within a Collection.

        Returns
        -------
        exists : `bool`
            Result of asking the datastore whether it holds the dataset.

        Raises
        ------
        LookupError
            Raised if the Dataset is not even present in the Registry.
        """
        datasetType = self._resolveDatasetType(datasetType)
        ref = self.registry.find(self.collection, datasetType, dataId)
        if ref is None:
            raise LookupError(
                "{} with {} not found in collection {}".format(datasetType, dataId, self.collection)
            )
        return self.datastore.exists(ref)