# This file is part of obs_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


__all__ = ("RawIngestTask", "RawIngestConfig", "makeTransferChoiceField")

import json
import re
from dataclasses import dataclass, InitVar
from typing import Callable, List, Iterator, Iterable, Tuple, Type, Optional, Any, Union
from collections import defaultdict
from multiprocessing import Pool

from astro_metadata_translator import ObservationInfo, merge_headers
from astro_metadata_translator.indexing import process_sidecar_data, process_index_data
from lsst.afw.fits import readMetadata
from lsst.daf.butler import (
    Butler,
    ButlerURI,
    CollectionType,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    DimensionRecord,
    DimensionUniverse,
    FileDataset,
    Formatter,
    Progress,
)
from lsst.pex.config import Config, ChoiceField, Field
from lsst.pipe.base import Task, timeMethod

from ._instrument import Instrument, makeExposureRecordFromObsInfo
from ._fitsRawFormatterBase import FitsRawFormatterBase


def _do_nothing(*args, **kwargs) -> None:
    """Do nothing.

    This is a function that accepts anything and does nothing.
    For use as a default in callback arguments.
    """
    pass


def _log_msg_counter(noun: Union[int, Iterable]) -> Tuple[int, str]:
    """Count the iterable and return the count and plural modifier.

    Parameters
    ----------
    noun : Iterable or `int`
        Thing to count. If given an integer it is assumed to be the count
        to use to calculate the modifier.

    Returns
    -------
    num : `int`
        Number of items found in ``noun``.
    modifier : `str`
        Character to add to the end of a string referring to these items
        to indicate whether it was a single item or not. Returns empty
        string if there is one item or "s" otherwise.

    Examples
    --------

    .. code-block:: python

        log.warning("Found %d file%s", *_log_msg_counter(nfiles))
    """
    if isinstance(noun, int):
        num = noun
    else:
        num = len(noun)
    return num, "" if num == 1 else "s"


@dataclass
class RawFileDatasetInfo:
    """Information about a single dataset within a raw file."""

    dataId: DataCoordinate
    """Data ID for this file (`lsst.daf.butler.DataCoordinate`)."""

    obsInfo: ObservationInfo
    """Standardized observation metadata extracted directly from the file
    headers (`astro_metadata_translator.ObservationInfo`).
    """


@dataclass
class RawFileData:
    """Information about a single raw file, used during ingest."""

    datasets: List[RawFileDatasetInfo]
    """The information describing each dataset within this raw file.
    (`list` of `RawFileDatasetInfo`)
    """

    filename: ButlerURI
    """URI of the file this information was extracted from (`ButlerURI`).

    This is the path prior to ingest, not the path after ingest.
    """

    FormatterClass: Type[FitsRawFormatterBase]
    """Formatter class that should be used to ingest this file (`type`; as
    subclass of `FitsRawFormatterBase`).
    """

    instrumentClass: Optional[Type[Instrument]]
    """The `Instrument` class associated with this file. Can be `None`
    if ``datasets`` is an empty list."""


@dataclass
class RawExposureData:
    """Information about a complete raw exposure, used during ingest."""

    dataId: DataCoordinate
    """Data ID for this exposure (`lsst.daf.butler.DataCoordinate`).
    """

    files: List[RawFileData]
    """List of structures containing file-level information.
    """

    universe: InitVar[DimensionUniverse]
    """Set of all known dimensions.
    """

    record: Optional[DimensionRecord] = None
    """The exposure `DimensionRecord` that must be inserted into the
    `~lsst.daf.butler.Registry` prior to file-level ingest (`DimensionRecord`).
    """

    def __post_init__(self, universe: DimensionUniverse):
        # We don't care which file or dataset we read metadata from, because
        # we're assuming they'll all be the same; just use the first ones.
        self.record = makeExposureRecordFromObsInfo(self.files[0].datasets[0].obsInfo, universe)


def makeTransferChoiceField(doc="How to transfer files (None for no transfer).", default="auto"):
    """Create a Config field with options for transferring data between repos.

    The allowed options for the field are exactly those supported by
    `lsst.daf.butler.Datastore.ingest`.

    Parameters
    ----------
    doc : `str`
        Documentation for the configuration field.
    default : `str`, optional
        Default transfer mode for the field.

    Returns
    -------
    field : `lsst.pex.config.ChoiceField`
        Configuration field.
    """
    return ChoiceField(
        doc=doc,
        dtype=str,
        allowed={"move": "move",
                 "copy": "copy",
                 "auto": "choice will depend on datastore",
                 "direct": "use URI to ingested file directly in datastore",
                 "link": "hard link falling back to symbolic link",
                 "hardlink": "hard link",
                 "symlink": "symbolic (soft) link",
                 "relsymlink": "relative symbolic link",
                 },
        optional=True,
        default=default
    )
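

# Illustrative sketch (not part of this module's API): the field created
# above is normally attached to a `lsst.pex.config.Config` subclass, as
# `RawIngestConfig` does below. A downstream config could do the same with a
# different default; the class name here is hypothetical.
#
#     class MyTransferConfig(Config):
#         transfer = makeTransferChoiceField(default="symlink")
#
#     config = MyTransferConfig()
#     assert config.transfer == "symlink"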


class RawIngestConfig(Config):
    """Configuration class for RawIngestTask."""

    transfer = makeTransferChoiceField()
    failFast = Field(
        dtype=bool,
        default=False,
        doc="If True, stop ingest as soon as any problem is encountered with any file. "
            "Otherwise problem files will be skipped and logged and a report issued at completion.",
    )
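

# For illustration only (a hedged sketch, not a documented recipe):
# `lsst.pex.config` fields are set by attribute assignment, so a caller might
# override the defaults like this before constructing the task:
#
#     config = RawIngestConfig()
#     config.transfer = "direct"
#     config.failFast = True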


class RawIngestTask(Task):
    """Driver Task for ingesting raw data into Gen3 Butler repositories.

    Parameters
    ----------
    config : `RawIngestConfig`
        Configuration for the task.
    butler : `~lsst.daf.butler.Butler`
        Writeable butler instance, with ``butler.run`` set to the appropriate
        `~lsst.daf.butler.CollectionType.RUN` collection for these raw
        datasets.
    on_success : `Callable`, optional
        A callback invoked when all of the raws associated with an exposure
        are ingested. Will be passed a list of `FileDataset` objects, each
        containing one or more resolved `DatasetRef` objects. If this callback
        raises, it will interrupt the entire ingest process, even if
        `RawIngestConfig.failFast` is `False`.
    on_metadata_failure : `Callable`, optional
        A callback invoked when a failure occurs trying to translate the
        metadata for a file. Will be passed the URI and the exception, in
        that order, as positional arguments. Guaranteed to be called in an
        ``except`` block, allowing the callback to re-raise or replace (with
        ``raise ... from``) to override the task's usual error handling (before
        `RawIngestConfig.failFast` logic occurs).
    on_ingest_failure : `Callable`, optional
        A callback invoked when dimension record or dataset insertion into the
        database fails for an exposure. Will be passed a `RawExposureData`
        instance and the exception, in that order, as positional arguments.
        Guaranteed to be called in an ``except`` block, allowing the callback
        to re-raise or replace (with ``raise ... from``) to override the task's
        usual error handling (before `RawIngestConfig.failFast` logic occurs).
    **kwargs
        Additional keyword arguments are forwarded to the `lsst.pipe.base.Task`
        constructor.

    Notes
    -----
    Each instance of `RawIngestTask` writes to the same Butler. Each
    invocation of `RawIngestTask.run` ingests a list of files.
    """

    ConfigClass = RawIngestConfig

    _DefaultName = "ingest"

    def getDatasetType(self):
        """Return the DatasetType of the datasets ingested by this Task."""
        return DatasetType("raw", ("instrument", "detector", "exposure"), "Exposure",
                           universe=self.butler.registry.dimensions)

    def __init__(self, config: Optional[RawIngestConfig] = None, *, butler: Butler,
                 on_success: Callable[[List[FileDataset]], Any] = _do_nothing,
                 on_metadata_failure: Callable[[str, Exception], Any] = _do_nothing,
                 on_ingest_failure: Callable[[RawExposureData, Exception], Any] = _do_nothing,
                 **kwargs: Any):
        config.validate()  # Not a CmdlineTask nor PipelineTask, so have to validate the config here.
        super().__init__(config, **kwargs)
        self.butler = butler
        self.universe = self.butler.registry.dimensions
        self.datasetType = self.getDatasetType()
        self._on_success = on_success
        self._on_metadata_failure = on_metadata_failure
        self._on_ingest_failure = on_ingest_failure
        self.progress = Progress("obs.base.RawIngestTask")

        # Import all the instrument classes so that we ensure that we
        # have all the relevant metadata translators loaded.
        Instrument.importAll(self.butler.registry)

    def _reduce_kwargs(self):
        # Add extra parameters to pickle.
        return dict(**super()._reduce_kwargs(), butler=self.butler, on_success=self._on_success,
                    on_metadata_failure=self._on_metadata_failure, on_ingest_failure=self._on_ingest_failure)

    def _determine_instrument_formatter(self, dataId, filename):
        """Determine the instrument and formatter class.

        Parameters
        ----------
        dataId : `lsst.daf.butler.DataCoordinate`
            The dataId associated with this dataset.
        filename : `ButlerURI`
            URI of file used for error reporting.

        Returns
        -------
        instrument : `Instrument` or `None`
            Instance of the `Instrument` associated with this dataset. `None`
            indicates that the instrument could not be determined.
        formatterClass : `type`
            Class to be used as the formatter for this dataset.
        """
        # The data model currently assumes that whilst multiple datasets
        # can be associated with a single file, they must all share the
        # same formatter.
        try:
            instrument = Instrument.fromName(dataId["instrument"], self.butler.registry)
        except LookupError as e:
            self._on_metadata_failure(filename, e)
            self.log.warning("Instrument %s for file %s not known to registry",
                             dataId["instrument"], filename)
            if self.config.failFast:
                raise RuntimeError(f"Instrument {dataId['instrument']} for"
                                   f" file {filename} not known to registry") from e
            FormatterClass = Formatter
            # Indicate that we could not work out the instrument.
            instrument = None
        else:
            FormatterClass = instrument.getRawFormatter(dataId)
        return instrument, FormatterClass

    def extractMetadata(self, filename: ButlerURI) -> RawFileData:
        """Extract and process metadata from a single raw file.

        Parameters
        ----------
        filename : `ButlerURI`
            URI to the file.

        Returns
        -------
        data : `RawFileData`
            A structure containing the metadata extracted from the file,
            as well as the original filename. All fields will be populated,
            but the `RawFileData.dataId` attribute will be a minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instance. The
            ``instrumentClass`` field will be `None` if there is a problem
            with metadata extraction.

        Notes
        -----
        Assumes that there is a single dataset associated with the given
        file. Instruments using a single file to store multiple datasets
        must implement their own version of this method.

        By default the method will catch all exceptions unless the
        ``failFast`` configuration item is `True`. If an error is
        encountered, the `_on_metadata_failure()` method will be called.
        If no exception is raised but an error was encountered, the
        returned object will have a `None` instrument class and no datasets.

        This method supports sidecar JSON files which can be used to
        extract metadata without having to read the data file itself.
        The sidecar file is always used if found.
        """
        sidecar_fail_msg = ""  # Requires prepended space when set.
        try:
            sidecar_file = filename.updatedExtension(".json")
            if sidecar_file.exists():
                content = json.loads(sidecar_file.read())
                header = process_sidecar_data(content)
                sidecar_fail_msg = " (via sidecar)"
            else:
                # Read the metadata from the data file itself.
                # Manually merge the primary and "first data" headers here
                # because we do not know in general if an input file has
                # set INHERIT=T.
                # For remote files download the entire file to get the
                # header. This is very inefficient and it would be better
                # to have some way of knowing where in the file the headers
                # are and to only download those parts of the file.
                with filename.as_local() as local_file:
                    phdu = readMetadata(local_file.ospath, 0)
                    header = merge_headers([phdu, readMetadata(local_file.ospath)], mode="overwrite")
            datasets = [self._calculate_dataset_info(header, filename)]
        except Exception as e:
            self.log.debug("Problem extracting metadata from %s%s: %s", filename, sidecar_fail_msg, e)
            # Indicate to the caller that we failed to read.
            datasets = []
            formatterClass = Formatter
            instrument = None
            self._on_metadata_failure(filename, e)
            if self.config.failFast:
                raise RuntimeError("Problem extracting metadata for file "
                                   f"{filename}{sidecar_fail_msg}") from e
        else:
            self.log.debug("Extracted metadata for file %s%s", filename, sidecar_fail_msg)
            # The data model currently assumes that whilst multiple datasets
            # can be associated with a single file, they must all share the
            # same formatter.
            instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId, filename)
            if instrument is None:
                datasets = []

        return RawFileData(datasets=datasets, filename=filename,
                           FormatterClass=formatterClass,
                           instrumentClass=instrument)
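
    # Illustrative note (an assumption about data layout, not an API of this
    # module): a raw file such as ``raw/exp0001.fits`` may be accompanied by a
    # sidecar ``raw/exp0001.json`` written by astro_metadata_translator; when
    # the sidecar exists, extractMetadata() above reads it instead of the FITS
    # headers, e.g.
    #
    #     task.extractMetadata(ButlerURI("raw/exp0001.fits"))
    #
    # will silently prefer ``raw/exp0001.json`` if it is present.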

    def _calculate_dataset_info(self, header, filename):
        """Calculate a RawFileDatasetInfo from the supplied information.

        Parameters
        ----------
        header : Mapping or `astro_metadata_translator.ObservationInfo`
            Header from the dataset or previously-translated content.
        filename : `ButlerURI`
            Filename to use for error messages.

        Returns
        -------
        dataset : `RawFileDatasetInfo`
            The dataId and observation information associated with this
            dataset.
        """
        # To ensure we aren't slowed down for no reason, explicitly
        # list here the properties we need for the schema.
        # Use a dict with values a boolean where True indicates
        # that it is required that we calculate this property.
        ingest_subset = {
            "altaz_begin": False,
            "boresight_rotation_coord": False,
            "boresight_rotation_angle": False,
            "dark_time": False,
            "datetime_begin": True,
            "datetime_end": True,
            "detector_num": True,
            "exposure_group": False,
            "exposure_id": True,
            "exposure_time": True,
            "instrument": True,
            "tracking_radec": False,
            "object": False,
            "observation_counter": False,
            "observation_id": True,
            "observation_reason": False,
            "observation_type": True,
            "observing_day": False,
            "physical_filter": True,
            "science_program": False,
            "visit_id": False,
        }

        if isinstance(header, ObservationInfo):
            obsInfo = header
            missing = []
            # Need to check the required properties are present.
            for property, required in ingest_subset.items():
                if not required:
                    continue
                # getattr does not need to be protected because it is using
                # the defined list above containing properties that must exist.
                value = getattr(obsInfo, property)
                if value is None:
                    missing.append(property)
            if missing:
                raise ValueError(f"Requested required properties are missing from file {filename}:"
                                 f" {missing} (via JSON)")

        else:
            obsInfo = ObservationInfo(header, pedantic=False, filename=str(filename),
                                      required={k for k in ingest_subset if ingest_subset[k]},
                                      subset=set(ingest_subset))

        dataId = DataCoordinate.standardize(instrument=obsInfo.instrument,
                                            exposure=obsInfo.exposure_id,
                                            detector=obsInfo.detector_num,
                                            universe=self.universe)
        return RawFileDatasetInfo(obsInfo=obsInfo, dataId=dataId)
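
    # For orientation (a hedged sketch, not an additional API): given either a
    # raw header mapping or an already-translated ObservationInfo, the helper
    # above yields the minimal exposure-level data ID, e.g.
    #
    #     info = task._calculate_dataset_info(obs_info, filename)
    #     info.dataId   # -> {instrument, exposure, detector}, unexpanded
    #     info.obsInfo  # -> the ObservationInfo used to build it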

    def locateAndReadIndexFiles(self, files):
        """Given a list of files, look for index files and read them.

        Index files can either be explicitly in the list of files to
        ingest, or else located in the same directory as a file to ingest.
        Index entries are always used if present.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.

        Returns
        -------
        index : `dict` [`str`, Any]
            Merged contents of all relevant index files found. These can
            be explicitly specified index files or ones found in the
            directory alongside a data file to be ingested.
        updated_files : iterable of `str`
            Updated list of the input files with entries removed that were
            found listed in an index file. Order is not guaranteed to
            match the order of the files given to this routine.
        good_index_files : `set` [`str`]
            Index files that were successfully read and used.
        bad_index_files : `set` [`str`]
            Files that looked like index files but failed to read properly.
        """
        # Convert the paths to absolute for easy comparison with index content.
        # Do not convert to real paths since we have to assume that index
        # files are in this location and not the location which it links to.
        files = tuple(f.abspath() for f in files)

        # Index files must be named this.
        index_root_file = "_index.json"

        # Group the files by directory.
        files_by_directory = defaultdict(set)

        for path in files:
            directory, file_in_dir = path.split()
            files_by_directory[directory].add(file_in_dir)

        # All the metadata read from index files with keys of full path.
        index_entries = {}

        # Index files we failed to read.
        bad_index_files = set()

        # Any good index files that were found and used.
        good_index_files = set()

        # Look for index files in those directories.
        for directory, files_in_directory in files_by_directory.items():
            possible_index_file = directory.join(index_root_file)
            if possible_index_file.exists():
                # If we are explicitly requesting an index file the
                # messages should be different.
                index_msg = "inferred"
                is_implied = True
                if index_root_file in files_in_directory:
                    index_msg = "explicit"
                    is_implied = False

                # Try to read the index file and catch and report any
                # problems.
                try:
                    content = json.loads(possible_index_file.read())
                    index = process_index_data(content, force_dict=True)
                except Exception as e:
                    # Only trigger the callback if the index file
                    # was asked for explicitly. Triggering on implied file
                    # might be surprising.
                    if not is_implied:
                        self._on_metadata_failure(possible_index_file, e)
                        if self.config.failFast:
                            raise RuntimeError(f"Problem reading index file from {index_msg} "
                                               f"location {possible_index_file}") from e
                    bad_index_files.add(possible_index_file)
                    continue

                self.log.debug("Extracted index metadata from %s file %s", index_msg, possible_index_file)
                good_index_files.add(possible_index_file)

                # Go through the index adding entries for files.
                # If we have non-index files in this directory marked for
                # ingest we should only get index information for those.
                # If the index file was explicit we use all entries.
                if is_implied:
                    files_to_ingest = files_in_directory
                else:
                    files_to_ingest = set(index)

                # Copy relevant metadata into a single dict for all index
                # entries.
                for file_in_dir in files_to_ingest:
                    # Skip an explicitly specified index file.
                    # This should never happen because an explicit index
                    # file will force ingest of all files in the index
                    # and not use the explicit file list. If somehow
                    # this is not true we continue. Raising an exception
                    # seems like the wrong thing to do since this is harmless.
                    if file_in_dir == index_root_file:
                        self.log.info("Logic error found scanning directory %s. Please file ticket.",
                                      directory)
                        continue
                    if file_in_dir in index:
                        file = directory.join(file_in_dir)
                        if file in index_entries:
                            # ObservationInfo overrides raw metadata
                            if isinstance(index[file_in_dir], ObservationInfo) \
                                    and not isinstance(index_entries[file], ObservationInfo):
                                self.log.warning("File %s already specified in an index file but overriding"
                                                 " with ObservationInfo content from %s",
                                                 file, possible_index_file)
                            else:
                                self.log.warning("File %s already specified in an index file, "
                                                 "ignoring content from %s", file, possible_index_file)
                                # Do nothing in this case
                                continue

                        index_entries[file] = index[file_in_dir]

        # Remove files from list that have index entries and also
        # any files that we determined to be explicit index files
        # or any index files that we failed to read.
        filtered = set(files) - set(index_entries) - good_index_files - bad_index_files

        # The filtered list loses the initial order. Retaining the order
        # is good for testing but does have a cost if there are many
        # files when copying the good values out. A dict would have faster
        # lookups (using the files as keys) but use more memory.
        ordered = [f for f in filtered if f in files]

        return index_entries, ordered, good_index_files, bad_index_files
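
    # Directory-layout sketch (an assumption for illustration, not a contract
    # of this module beyond the ``_index.json`` naming used above):
    #
    #     raw/night1/_index.json   <- per-directory index, read via
    #                                 astro_metadata_translator's
    #                                 process_index_data()
    #     raw/night1/exp0001.fits
    #     raw/night1/exp0002.fits
    #
    # Passing the FITS files alone uses the index implicitly; passing the
    # _index.json itself ingests every file listed in it.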

    def processIndexEntries(self, index_entries):
        """Convert index entries to RawFileData.

        Parameters
        ----------
        index_entries : `dict` [`str`, Any]
            Dict indexed by name of file to ingest and with values either
            raw metadata or translated
            `~astro_metadata_translator.ObservationInfo`.

        Returns
        -------
        data : `list` [`RawFileData`]
            Structures containing the metadata extracted from each file,
            as well as the original filenames. All fields will be populated,
            but the `RawFileData.dataId` attributes will be minimal
            (unexpanded) `~lsst.daf.butler.DataCoordinate` instances.
        """
        fileData = []
        for filename, metadata in index_entries.items():
            try:
                datasets = [self._calculate_dataset_info(metadata, filename)]
            except Exception as e:
                self.log.debug("Problem extracting metadata for file %s found in index file: %s",
                               filename, e)
                datasets = []
                formatterClass = Formatter
                instrument = None
                self._on_metadata_failure(filename, e)
                if self.config.failFast:
                    raise RuntimeError(f"Problem extracting metadata for file {filename} "
                                       "found in index file") from e
            else:
                instrument, formatterClass = self._determine_instrument_formatter(datasets[0].dataId,
                                                                                  filename)
                if instrument is None:
                    datasets = []
            fileData.append(RawFileData(datasets=datasets, filename=filename,
                                        FormatterClass=formatterClass, instrumentClass=instrument))
        return fileData

    def groupByExposure(self, files: Iterable[RawFileData]) -> List[RawExposureData]:
        """Group an iterable of `RawFileData` by exposure.

        Parameters
        ----------
        files : iterable of `RawFileData`
            File-level information to group.

        Returns
        -------
        exposures : `list` of `RawExposureData`
            A list of structures that group the file-level information by
            exposure. All fields will be populated. The
            `RawExposureData.dataId` attributes will be minimal (unexpanded)
            `~lsst.daf.butler.DataCoordinate` instances.
        """
        exposureDimensions = self.universe["exposure"].graph
        byExposure = defaultdict(list)
        for f in files:
            # Assume that the first dataset is representative for the file.
            byExposure[f.datasets[0].dataId.subset(exposureDimensions)].append(f)

        return [RawExposureData(dataId=dataId, files=exposureFiles, universe=self.universe)
                for dataId, exposureFiles in byExposure.items()]

    def expandDataIds(self, data: RawExposureData) -> RawExposureData:
        """Expand the data IDs associated with a raw exposure.

        This adds the metadata records.

        Parameters
        ----------
        data : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated. Should
            be considered consumed upon return.

        Returns
        -------
        exposure : `RawExposureData`
            An updated version of the input structure, with
            `RawExposureData.dataId` and nested `RawFileData.dataId` attributes
            updated to data IDs for which
            `~lsst.daf.butler.DataCoordinate.hasRecords` returns `True`.
        """
        # We start by expanding the exposure-level data ID; we won't use that
        # directly in file ingest, but this lets us do some database lookups
        # once per exposure instead of once per file later.
        data.dataId = self.butler.registry.expandDataId(
            data.dataId,
            # We pass in the records we'll be inserting shortly so they aren't
            # looked up from the database. We do expect instrument and filter
            # records to be retrieved from the database here (though the
            # Registry may cache them so there isn't a lookup every time).
            records={
                self.butler.registry.dimensions["exposure"]: data.record,
            }
        )
        # Now we expand the per-file (exposure+detector) data IDs. This time
        # we pass in the records we just retrieved from the exposure data ID
        # expansion.
        for file in data.files:
            for dataset in file.datasets:
                dataset.dataId = self.butler.registry.expandDataId(
                    dataset.dataId,
                    records=dict(data.dataId.records)
                )
        return data

    def prep(self, files, *, pool: Optional[Pool] = None, processes: int = 1
             ) -> Tuple[Iterator[RawExposureData], List[str]]:
        """Perform all non-database-updating ingest preprocessing steps.

        Parameters
        ----------
        files : iterable over `str` or path-like objects
            Paths to the files to be ingested. Will be made absolute
            if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.

        Returns
        -------
        exposures : `Iterator` [ `RawExposureData` ]
            Data structures containing dimension records, filenames, and data
            IDs to be ingested (one structure for each exposure).
        bad_files : `list` of `str`
            List of all the files that could not have metadata extracted.
        """
        if pool is None and processes > 1:
            pool = Pool(processes)
        mapFunc = map if pool is None else pool.imap_unordered

        def _partition_good_bad(file_data: Iterable[RawFileData]) -> Tuple[List[RawFileData], List[str]]:
            """Filter out bad files and return good with list of bad."""
            good_files = []
            bad_files = []
            for fileDatum in self.progress.wrap(file_data, desc="Reading image metadata", total=len(files)):
                if not fileDatum.datasets:
                    bad_files.append(fileDatum.filename)
                else:
                    good_files.append(fileDatum)
            return good_files, bad_files

        # Look for index files and read them.
        # There should be far fewer index files than data files.
        index_entries, files, good_index_files, bad_index_files = self.locateAndReadIndexFiles(files)
        if bad_index_files:
            self.log.info("Failed to read the following explicitly requested index files:")
            for bad in sorted(bad_index_files):
                self.log.info("- %s", bad)

        # Now convert all the index file entries to standard form for ingest.
        bad_index_file_data = []
        indexFileData = self.processIndexEntries(index_entries)
        if indexFileData:
            indexFileData, bad_index_file_data = _partition_good_bad(indexFileData)
            self.log.info("Successfully extracted metadata for %d file%s found in %d index file%s"
                          " with %d failure%s",
                          *_log_msg_counter(indexFileData),
                          *_log_msg_counter(good_index_files),
                          *_log_msg_counter(bad_index_file_data))

        # Extract metadata and build per-detector regions.
        # This could run in a subprocess so collect all output
        # before looking at failures.
        fileData: Iterator[RawFileData] = mapFunc(self.extractMetadata, files)

        # Filter out all the failed reads and store them for later
        # reporting.
        fileData, bad_files = _partition_good_bad(fileData)
        self.log.info("Successfully extracted metadata from %d file%s with %d failure%s",
                      *_log_msg_counter(fileData),
                      *_log_msg_counter(bad_files))

        # Combine with data from index files.
        fileData.extend(indexFileData)
        bad_files.extend(bad_index_file_data)
        bad_files.extend(bad_index_files)

        # Use that metadata to group files (and extracted metadata) by
        # exposure. Never parallelized because it's intrinsically a gather
        # step.
        exposureData: List[RawExposureData] = self.groupByExposure(fileData)

        # The next operation operates on RawExposureData instances (one at
        # a time) in-place and then returns the modified instance. We call it
        # as a pass-through instead of relying on the arguments we pass in to
        # have been modified because in the parallel case those arguments are
        # going to be pickled and unpickled, and I'm not certain
        # multiprocessing is careful enough with that for output arguments to
        # work.

        # Expand the data IDs to include all dimension metadata; we need this
        # because we may need to generate path templates that rely on that
        # metadata.
        # This is the first step that involves actual database calls (but just
        # SELECTs), so if there's going to be a problem with connections vs.
        # multiple processes, or lock contention (in SQLite) slowing things
        # down, it'll happen here.
        return mapFunc(self.expandDataIds, exposureData), bad_files

    def ingestExposureDatasets(self, exposure: RawExposureData, *, run: Optional[str] = None
                               ) -> List[FileDataset]:
        """Ingest all raw files in one exposure.

        Parameters
        ----------
        exposure : `RawExposureData`
            A structure containing information about the exposure to be
            ingested. Must have `RawExposureData.record` populated and all
            data ID attributes expanded.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            ``self.butler.run``.

        Returns
        -------
        datasets : `list` of `lsst.daf.butler.FileDataset`
            Per-file structures identifying the files ingested and their
            dataset representation in the data repository.
        """
        datasets = [FileDataset(path=file.filename.abspath(),
                                refs=[DatasetRef(self.datasetType, d.dataId) for d in file.datasets],
                                formatter=file.FormatterClass)
                    for file in exposure.files]
        self.butler.ingest(*datasets, transfer=self.config.transfer, run=run)
        return datasets

    def ingestFiles(self, files, *, pool: Optional[Pool] = None, processes: int = 1,
                    run: Optional[str] = None):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`
            URIs to the files to be ingested.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.
        """

        exposureData, bad_files = self.prep(files, pool=pool, processes=processes)

        # Up to this point, we haven't modified the data repository at all.
        # Now we finally do that, with one transaction per exposure. This is
        # not parallelized at present because the performance of this step is
        # limited by the database server. That may or may not change in the
        # future once we increase our usage of bulk inserts and reduce our
        # usage of savepoints; we've tried to get everything but the database
        # operations done in advance to reduce the time spent inside
        # transactions.
        self.butler.registry.registerDatasetType(self.datasetType)

        refs = []
        runs = set()
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        for exposure in self.progress.wrap(exposureData, desc="Ingesting raw exposures"):

            self.log.debug("Attempting to ingest %d file%s from exposure %s:%s",
                           *_log_msg_counter(exposure.files),
                           exposure.record.instrument, exposure.record.obs_id)

            try:
                self.butler.registry.syncDimensionData("exposure", exposure.record)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_exposures_failed += 1
                self.log.warning("Exposure %s:%s could not be registered: %s",
                                 exposure.record.instrument, exposure.record.obs_id, e)
                if self.config.failFast:
                    raise e
                continue

            # Override default run if nothing specified explicitly.
            if run is None:
                instrumentClass = exposure.files[0].instrumentClass
                this_run = instrumentClass.makeDefaultRawIngestRunName()
            else:
                this_run = run
            if this_run not in runs:
                self.butler.registry.registerCollection(this_run, type=CollectionType.RUN)
                runs.add(this_run)
            try:
                with self.butler.transaction():
                    datasets_for_exposure = self.ingestExposureDatasets(exposure, run=this_run)
            except Exception as e:
                self._on_ingest_failure(exposure, e)
                n_ingests_failed += 1
                self.log.warning("Failed to ingest the following for reason: %s", e)
                for f in exposure.files:
                    self.log.warning("- %s", f.filename)
                if self.config.failFast:
                    raise e
                continue
            else:
                self._on_success(datasets_for_exposure)
                for dataset in datasets_for_exposure:
                    refs.extend(dataset.refs)

            # Success for this exposure.
            n_exposures += 1
            self.log.info("Exposure %s:%s ingested successfully",
                          exposure.record.instrument, exposure.record.obs_id)

        return refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed

    @timeMethod
    def run(self, files, *, pool: Optional[Pool] = None, processes: int = 1, run: Optional[str] = None,
            file_filter: Union[str, re.Pattern] = r"\.fit[s]?\b", group_files: bool = True):
        """Ingest files into a Butler data repository.

        This creates any new exposure or visit Dimension entries needed to
        identify the ingested files, creates new Dataset entries in the
        Registry and finally ingests the files themselves into the Datastore.
        Any needed instrument, detector, and physical_filter Dimension entries
        must exist in the Registry before `run` is called.

        Parameters
        ----------
        files : iterable over `ButlerURI`, `str` or path-like objects
            Paths to the files to be ingested. Can refer to directories.
            Will be made absolute if they are not already.
        pool : `multiprocessing.Pool`, optional
            If not `None`, a process pool with which to parallelize some
            operations.
        processes : `int`, optional
            The number of processes to use. Ignored if ``pool`` is not `None`.
        run : `str`, optional
            Name of a RUN-type collection to write to, overriding
            the default derived from the instrument name.
        file_filter : `str` or `re.Pattern`, optional
            Pattern to use to discover files to ingest within directories.
            The default is to search for FITS files. The regex applies to
            files within the directory.
        group_files : `bool`, optional
            Group files by directory if they have been discovered in
            directories. Will not affect files explicitly provided.

        Returns
        -------
        refs : `list` of `lsst.daf.butler.DatasetRef`
            Dataset references for ingested raws.

        Notes
        -----
        This method inserts all datasets for an exposure within a transaction,
        guaranteeing that partial exposures are never ingested. The exposure
        dimension record is inserted with `Registry.syncDimensionData` first
        (in its own transaction), which inserts only if a record with the same
        primary key does not already exist. This allows different files within
        the same exposure to be ingested in different runs.
        """

        refs = []
        bad_files = []
        n_exposures = 0
        n_exposures_failed = 0
        n_ingests_failed = 0
        if group_files:
            for group in ButlerURI.findFileResources(files, file_filter, group_files):
                new_refs, bad, n_exp, n_exp_fail, n_ingest_fail = self.ingestFiles(group, pool=pool,
                                                                                   processes=processes,
                                                                                   run=run)
                refs.extend(new_refs)
                bad_files.extend(bad)
                n_exposures += n_exp
                n_exposures_failed += n_exp_fail
                n_ingests_failed += n_ingest_fail
        else:
            refs, bad_files, n_exposures, n_exposures_failed, n_ingests_failed = self.ingestFiles(
                ButlerURI.findFileResources(files, file_filter, group_files),
                pool=pool,
                processes=processes,
                run=run,
            )

        had_failure = False

        if bad_files:
            had_failure = True
            self.log.warning("Could not extract observation metadata from the following:")
            for f in bad_files:
                self.log.warning("- %s", f)

        self.log.info("Successfully processed data from %d exposure%s with %d failure%s from exposure"
                      " registration and %d failure%s from file ingest.",
                      *_log_msg_counter(n_exposures),
                      *_log_msg_counter(n_exposures_failed),
                      *_log_msg_counter(n_ingests_failed))
        if n_exposures_failed > 0 or n_ingests_failed > 0:
            had_failure = True
        self.log.info("Ingested %d distinct Butler dataset%s", *_log_msg_counter(refs))

        if had_failure:
            raise RuntimeError("Some failures encountered during ingestion")

        return refs
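

# A hedged end-to-end sketch (the repository path and directory are assumed,
# and this is not an officially documented recipe):
#
#     from lsst.daf.butler import Butler
#     from lsst.obs.base import RawIngestTask, RawIngestConfig
#
#     butler = Butler("/repo/main", writeable=True)
#     config = RawIngestConfig()
#     config.transfer = "symlink"
#     task = RawIngestTask(config=config, butler=butler)
#     refs = task.run(["/data/raws/night1/"])
#
# run() discovers FITS files in the given directories, groups them by
# exposure, ingests each exposure in its own transaction, and raises
# RuntimeError if any file fails to ingest.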