Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

# This file is part of obs_base. 

# 

# Developed for the LSST Data Management System. 

# This product includes software developed by the LSST Project 

# (https://www.lsst.org). 

# See the COPYRIGHT file at the top-level directory of this distribution 

# for details of code ownership. 

# 

# This program is free software: you can redistribute it and/or modify 

# it under the terms of the GNU General Public License as published by 

# the Free Software Foundation, either version 3 of the License, or 

# (at your option) any later version. 

# 

# This program is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

# GNU General Public License for more details. 

# 

# You should have received a copy of the GNU General Public License 

# along with this program. If not, see <http://www.gnu.org/licenses/>. 

from __future__ import annotations 

 

__all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"] 

 

import os 

import fnmatch 

from dataclasses import dataclass 

from typing import Iterable, Optional, List, Dict 

 

from lsst.utils import doImport 

from lsst.daf.butler import ( 

Butler as Butler3, 

SkyPixDimension 

) 

from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field 

from lsst.pipe.base import Task 

from lsst.skymap import skyMapRegistry, BaseSkyMap 

 

from ..ingest import RawIngestTask 

from .repoConverter import ConversionSubset 

from .rootRepoConverter import RootRepoConverter 

from .calibRepoConverter import CalibRepoConverter 

from .standardRepoConverter import StandardRepoConverter 

 

 

@dataclass
class ConfiguredSkyMap:
    """Struct containing information about a skymap that may appear in a Gen2
    repository.
    """

    name: str
    """Name of the skymap used in Gen3 data IDs.
    """

    sha1: bytes
    """Hash computed by `BaseSkyMap.getSha1`.
    """

    instance: BaseSkyMap
    """The skymap instance itself (an `lsst.skymap.BaseSkyMap`).
    """

    used: bool = False
    """Whether this skymap has been found in at least one repository being
    converted.
    """

 

 

class ConvertRepoSkyMapConfig(Config):
    """Sub-config wrapping the type and parameters of a single SkyMap.

    Notes
    -----
    The only reason this class exists is that a
    `~lsst.pex.config.RegistryField` cannot be placed directly inside a
    `~lsst.pex.config.ConfigDictField`.

    Its single field is deliberately named "skyMap" so that it matches the
    configuration of `lsst.pipe.tasks.MakeSkyMapTask`; that lets one config
    file in an obs package configure both.

    The price of that compatibility is some repetition when the field is
    reached through the dict field that holds these configs -
    "skyMaps[name].skyMap" - which seems unavoidable.
    """

    skyMap = skyMapRegistry.makeField(
        default="dodeca",
        doc="Type and parameters for the SkyMap itself.",
    )

 

 

class ConvertRepoConfig(Config):
    """Configuration for `ConvertRepoTask`.

    Notes
    -----
    ``transfer`` and ``instrument`` are not fields of this config; they are
    properties that read from and write to the same-named attributes of the
    nested ``raws`` config.
    """

    raws = ConfigurableField(
        "Configuration for subtask responsible for ingesting raws and adding "
        "visit and exposure dimension entries.",
        target=RawIngestTask,
    )
    skyMaps = ConfigDictField(
        "Mapping from Gen3 skymap name to the parameters used to construct a "
        "BaseSkyMap instance. This will be used to associate names with "
        "existing skymaps found in the Gen2 repo.",
        keytype=str,
        itemtype=ConvertRepoSkyMapConfig,
        default={}
    )
    collections = DictField(
        "Special collections (values) for certain dataset types (keys). "
        "These are used in addition to rerun collections for datasets in "
        "reruns. The 'raw' dataset must have an entry here if it is to be "
        "converted.",
        keytype=str,
        itemtype=str,
        default={
            "deepCoadd_skyMap": "skymaps",
            "brightObjectMask": "masks",
        }
    )
    storageClasses = DictField(
        "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
        "or 'persistable') to the Gen3 StorageClass name.",
        keytype=str,
        itemtype=str,
        default={
            "BaseSkyMap": "SkyMap",
            "BaseCatalog": "Catalog",
            "BackgroundList": "Background",
            "raw": "Exposure",
            "MultilevelParquetTable": "DataFrame",
        }
    )
    doRegisterInstrument = Field(
        "If True (default), add dimension records for the Instrument and its "
        "filters and detectors to the registry instead of assuming they are "
        "already present.",
        dtype=bool,
        default=True,
    )
    doWriteCuratedCalibrations = Field(
        "If True (default), ingest human-curated calibrations directly via "
        "the Instrument interface. Note that these calibrations are never "
        "converted from Gen2 repositories.",
        dtype=bool,
        default=True,
    )
    refCats = ListField(
        "The names of reference catalogs (subdirectories under ref_cats) to "
        "be converted",
        dtype=str,
        default=[]
    )
    fileIgnorePatterns = ListField(
        "Filename globs that should be ignored instead of being treated as "
        "datasets.",
        dtype=str,
        default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3"]
    )
    datasetIncludePatterns = ListField(
        "Glob-style patterns for dataset type names that should be converted.",
        dtype=str,
        default=["*"]
    )
    datasetIgnorePatterns = ListField(
        "Glob-style patterns for dataset type names that should not be "
        "converted despite matching a pattern in datasetIncludePatterns.",
        dtype=str,
        default=[]
    )
    ccdKey = Field(
        "Key used for the Gen2 equivalent of 'detector' in data IDs.",
        dtype=str,
        default="ccd",
    )
    # The help text previously claimed True was the default while the actual
    # default is False; the text now matches the value.
    relatedOnly = Field(
        "If True, only convert datasets that are related to the "
        "ingested visits (default is False). Ignored unless a list of "
        "visits is passed to run().",
        dtype=bool,
        default=False,
    )

    @property
    def transfer(self):
        """Value of ``raws.transfer`` (read/write pass-through)."""
        return self.raws.transfer

    @transfer.setter
    def transfer(self, value):
        self.raws.transfer = value

    @property
    def instrument(self):
        """Value of ``raws.instrument`` (read/write pass-through)."""
        return self.raws.instrument

    @instrument.setter
    def instrument(self, value):
        self.raws.instrument = value

    def setDefaults(self):
        """Set ``transfer`` (and hence ``raws.transfer``) to `None`."""
        self.transfer = None

    # TODO: check that there are no collection overrides for curated
    # calibrations, since we don't have a good way to utilize them.

 

 

class ConvertRepoTask(Task):
    """A task that converts one or more related Gen2 data repositories to a
    single Gen3 data repository (with multiple collections).

    Parameters
    ----------
    config : `ConvertRepoConfig`
        Configuration for this task.
    butler3 : `lsst.daf.butler.Butler`
        Gen3 Butler instance that represents the data repository datasets will
        be ingested into.  The collection and/or run associated with this
        Butler will be ignored in favor of collections/runs passed via config
        or to `run`.
    kwds
        Other keyword arguments are forwarded to the `Task` constructor.

    Notes
    -----
    Most of the work of converting repositories is delegated to instances of
    the `RepoConverter` hierarchy.  The `ConvertRepoTask` instance itself holds
    only state that is relevant for all Gen2 repositories being ingested, while
    each `RepoConverter` instance holds only state relevant for the conversion
    of a single Gen2 repository.  Both the task and the `RepoConverter`
    instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
    methods may only be called once on a particular instance.
    """

    ConfigClass = ConvertRepoConfig

    _DefaultName = "convertRepo"

    def __init__(self, config=None, *, butler3: Butler3, **kwds):
        super().__init__(config, **kwds)
        self.butler3 = butler3
        self.registry = self.butler3.registry
        self.universe = self.registry.dimensions
        if self.isDatasetTypeIncluded("raw"):
            # Raws are being converted, so the ingest subtask is needed; reuse
            # the instrument it holds.
            self.makeSubtask("raws", butler=butler3)
            self.instrument = self.raws.instrument
        else:
            # No raw ingest: import and instantiate the configured instrument
            # class directly.
            self.raws = None
            self.instrument = doImport(self.config.instrument)()
        # Index the configured skymaps both by hash (to recognize skymaps
        # found in Gen2 repos) and by Gen3 name.
        self._configuredSkyMapsBySha1 = {}
        self._configuredSkyMapsByName = {}
        for name, skyMapConfig in self.config.skyMaps.items():
            instance = skyMapConfig.skyMap.apply()
            struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
            self._configuredSkyMapsBySha1[struct.sha1] = struct
            self._configuredSkyMapsByName[struct.name] = struct
        self._usedSkyPix = set()

    def isDatasetTypeIncluded(self, datasetTypeName: str) -> bool:
        """Return `True` if configuration indicates that the given dataset type
        should be converted.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.

        Returns
        -------
        included : `bool`
            Whether the dataset should be included in the conversion: `True`
            if the name matches at least one pattern in
            ``config.datasetIncludePatterns`` and no pattern in
            ``config.datasetIgnorePatterns`` (case-sensitive glob matching).
        """
        return (
            any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                for pattern in self.config.datasetIncludePatterns) and
            not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
                    for pattern in self.config.datasetIgnorePatterns)
        )

    def useSkyMap(self, skyMap: BaseSkyMap) -> str:
        """Indicate that a repository uses the given SkyMap.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        skyMap : `lsst.skymap.BaseSkyMap`
            SkyMap instance being used, typically retrieved from a Gen2
            data repository.

        Returns
        -------
        name : `str`
            The name of the skymap in Gen3 data IDs.

        Raises
        ------
        LookupError
            Raised if no configured skymap has the same hash as ``skyMap``.
        """
        sha1 = skyMap.getSha1()
        try:
            struct = self._configuredSkyMapsBySha1[sha1]
        except KeyError as err:
            raise LookupError(f"SkyMap with sha1={sha1} not included in configuration.") from err
        struct.used = True
        return struct.name

    def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
        """Register all skymaps that have been marked as used.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the tracts of this skymap that overlap the visits in
            the subset.
        """
        for struct in self._configuredSkyMapsBySha1.values():
            if struct.used:
                struct.instance.register(struct.name, self.registry)
                if subset is not None and self.config.relatedOnly:
                    subset.addSkyMap(self.registry, struct.name)

    def useSkyPix(self, dimension: SkyPixDimension):
        """Indicate that a repository uses the given SkyPix dimension.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        dimension : `lsst.daf.butler.SkyPixDimension`
            Dimension representing a pixelization of the sky.
        """
        self._usedSkyPix.add(dimension)

    def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
        """Add all SkyPix dimensions that have been marked as used to the
        given subset.

        This method is intended to be called primarily by the
        `RepoConverter` instances used internally by the task.

        Parameters
        ----------
        subset : `ConversionSubset`, optional
            Object that will be used to filter converted datasets by data ID.
            If given (and ``config.relatedOnly`` is `True`), it will be
            updated with the pixelization IDs that overlap the visits in the
            subset.  Otherwise this method does nothing.
        """
        if subset is not None and self.config.relatedOnly:
            for dimension in self._usedSkyPix:
                subset.addSkyPix(self.registry, dimension)

    def run(self, root: str, collections: List[str], *,
            calibs: Optional[Dict[str, List[str]]] = None,
            reruns: Optional[Dict[str, List[str]]] = None,
            visits: Optional[Iterable[int]] = None):
        """Convert a group of related data repositories.

        Parameters
        ----------
        root : `str`
            Complete path to the root Gen2 data repository.  This should be
            a data repository that includes a Gen2 registry and any raw files
            and/or reference catalogs.
        collections : `list` of `str`
            Gen3 collections that datasets from the root repository should be
            associated with.  This should include any rerun collection that
            these datasets should also be considered to be part of; because of
            structural difference between Gen2 parent/child relationships and
            Gen3 collections, these cannot be reliably inferred.
        calibs : `dict`, optional
            Dictionary mapping calibration repository path to the collections
            that the repository's datasets should be associated with.  The path
            may be relative to ``root`` or absolute.  Collections should
            include child repository collections as appropriate (see
            documentation for ``collections``).  `None` (default) is treated
            as an empty dictionary.
        reruns : `dict`, optional
            Dictionary mapping rerun repository path to the collections that
            the repository's datasets should be associated with.  The path may
            be relative to ``root`` or absolute.  Collections should include
            child repository collections as appropriate (see documentation for
            ``collections``).  `None` (default) is treated as an empty
            dictionary.
        visits : iterable of `int`, optional
            The integer IDs of visits to convert.  If not provided, all visits
            in the Gen2 root repository will be converted.
        """

        if calibs is None:
            calibs = {}
        if reruns is None:
            reruns = {}
        if visits is not None:
            subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
        else:
            if self.config.relatedOnly:
                self.log.warn("config.relatedOnly is True but all visits are being ingested; "
                              "no filtering will be done.")
            subset = None

        if self.config.doRegisterInstrument:
            self.instrument.register(self.registry)

        # Make and prep converters for all Gen2 repos.  This should not modify
        # the Registry database or filesystem at all, though it may query it.
        converters = []
        rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
        rootConverter.prep()
        converters.append(rootConverter)

        # Loop variables are named distinctly from the ``root`` and
        # ``collections`` arguments so that those are never rebound.
        for calibRoot, calibCollections in calibs.items():
            if not os.path.isabs(calibRoot):
                calibRoot = os.path.join(rootConverter.root, calibRoot)
            converter = CalibRepoConverter(task=self, root=calibRoot, collections=calibCollections,
                                           mapper=rootConverter.mapper,
                                           subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        for rerunRoot, rerunCollections in reruns.items():
            if not os.path.isabs(rerunRoot):
                rerunRoot = os.path.join(rootConverter.root, rerunRoot)
            converter = StandardRepoConverter(task=self, root=rerunRoot, collections=rerunCollections,
                                              subset=rootConverter.subset)
            converter.prep()
            converters.append(converter)

        # Actual database writes start here.  We can't wrap these sanely in
        # transactions (yet) because we keep initializing new Butler instances
        # just so we can write into new runs/collections, and transactions
        # are managed at the Butler level (DM-21246 should let us fix this).

        # Insert dimensions needed by any converters.  These are only the
        # dimensions that a converter expects to be uniquely derived from the
        # Gen2 repository it is responsible for - e.g. visits, exposures, and
        # calibration_labels.
        #
        # Note that we do not try to filter dimensions down to just those
        # related to the given visits, even if config.relatedOnly is True; we
        # need them in the Gen3 repo in order to be able to know which datasets
        # to convert, because Gen2 alone doesn't know enough about the
        # relationships between data IDs.
        for converter in converters:
            converter.insertDimensionData()

        # Insert dimensions that are potentially shared by all Gen2
        # repositories (and are hence managed directly by the Task, rather
        # than a converter instance).
        # This also finishes setting up the (shared) ConversionSubset object
        # that is used to filter data IDs for config.relatedOnly.
        self.registerUsedSkyMaps(rootConverter.subset)
        self.registerUsedSkyPix(rootConverter.subset)

        # Actually ingest datasets.
        for converter in converters:
            converter.ingest()