Coverage for python/lsst/summit/utils/butlerUtils.py: 13%
240 statements

# This file is part of summit_utils.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import copy
import itertools

import lsst.daf.butler as dafButler

from lsst.summit.utils.utils import getSite


__all__ = ["makeDefaultLatissButler",
           "updateDataId",
           "sanitize_day_obs",
           "getMostRecentDayObs",
           "getSeqNumsForDayObs",
           "getMostRecentDataId",
           "getDatasetRefForDataId",
           "getDayObs",
           "getSeqNum",
           "getExpId",
           "datasetExists",
           "sortRecordsByDayObsThenSeqNum",
           "getDaysWithData",
           "getExpIdFromDayObsSeqNum",
           "updateDataIdOrDataCord",
           "fillDataId",
           "getExpRecordFromDataId",
           "getDayObsSeqNumFromExposureId",
           "removeDataProduct",
           "getLatissOnSkyDataIds",
           ]

_LATISS_DEFAULT_COLLECTIONS = ['LATISS/raw/all', 'LATISS/calib', "LATISS/runs/quickLook"]

# RECENT_DAY must be in the past *and have data* (otherwise some tests are
# no-ops). It speeds up queries by restricting them significantly, but data
# must definitely have been taken since it. It should also not be more than
# 2 months in the past, due to the 60 day lookback time on the summit. All
# this means it should be updated by an informed human.
RECENT_DAY = 20220503


def _configureForSite():
    try:
        site = getSite()
    except ValueError:
        # this function is run automatically on module import, so don't
        # fail on k8s, where the site cannot yet be determined
        print("WARNING: failed to automatically determine site")
        site = None

    if site == 'tucson':
        global RECENT_DAY
        RECENT_DAY = 20211104  # TTS has limited data, so use this day


_configureForSite()


def getLatissDefaultCollections():
    """Get the default set of LATISS collections, updated for the site at
    which the code is being run.

    Returns
    -------
    collections : `list` of `str`
        The default collections for the site.
    """
    # copy so that appending below never mutates the module-level default
    collections = copy.copy(_LATISS_DEFAULT_COLLECTIONS)
    try:
        site = getSite()
    except ValueError:
        site = ''

    if site in ('tucson', 'summit'):
        collections.append("LATISS-test-data")
    return collections


def _update_RECENT_DAY(day):
    """Update the value for RECENT_DAY once we have a value for free."""
    global RECENT_DAY
    RECENT_DAY = max(day - 1, RECENT_DAY)


def makeDefaultLatissButler(*, extraCollections=None, writeable=False, embargo=False):
    """Create a butler for LATISS using the default collections.

    Parameters
    ----------
    extraCollections : `list` of `str`, optional
        Extra input collections to supply to the butler init.
    writeable : `bool`, optional
        Whether to make a writeable butler.
    embargo : `bool`, optional
        Whether to use the embargo repo instead of the main one. Needed to
        access embargoed data.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        The butler.
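
    Examples
    --------
    A minimal usage sketch; it assumes a site with access to a LATISS repo,
    and the extra collection name is purely illustrative:

    >>> butler = makeDefaultLatissButler()
    >>> butler = makeDefaultLatissButler(extraCollections=['u/someUser/testCollection'],
    ...                                  writeable=True)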
125 """
126 # TODO: Add logging to which collections are going in
127 collections = getLatissDefaultCollections()
128 if extraCollections:
129 collections.extend(extraCollections)
130 try:
131 repoString = "LATISS" if not embargo else "/repo/embargo"
132 butler = dafButler.Butler(repoString,
133 collections=collections,
134 writeable=writeable,
135 instrument='LATISS')
136 except(FileNotFoundError, RuntimeError):
137 # Depending on the value of DAF_BUTLER_REPOSITORY_INDEX and whether
138 # it is present and blank, or just not set, both these exception
139 # types can be raised, see tests/test_butlerUtils.py:ButlerInitTestCase
140 # for details and tests which confirm these have not changed
141 raise FileNotFoundError # unify exception type
142 return butler


# TODO: DM-32940 can remove this whole function once that ticket merges.
def datasetExists(butler, dataProduct, dataId, **kwargs):
    """Collapse the tri-state behaviour of butler.datasetExists to a boolean.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataProduct : `str`
        The type of data product to check for.
    dataId : `dict`
        The dataId of the dataProduct to check for.

    Returns
    -------
    exists : `bool`
        True if the dataProduct exists for the dataId and can be retrieved,
        else False.
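
    Examples
    --------
    Illustrative only; assumes a butler with LATISS data for this dataId:

    >>> butler = makeDefaultLatissButler()
    >>> datasetExists(butler, 'raw', {'day_obs': 20220503, 'seq_num': 123, 'detector': 0})
    True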
163 """
164 try:
165 exists = butler.datasetExists(dataProduct, dataId, **kwargs)
166 return exists
167 except (LookupError, RuntimeError):
168 return False


def updateDataId(dataId, **kwargs):
    """Update a DataCoordinate or dataId dict with kwargs.

    Provides a single interface for adding the detector key (or others) to a
    dataId, whether it is a DataCoordinate or a dict.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The dataId to update.
    kwargs : `dict`
        The keys and values to add to the dataId.

    Returns
    -------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The updated dataId, with the same type as the input.
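
    Examples
    --------
    The dict case is shown here; a DataCoordinate input returns an updated
    DataCoordinate instead:

    >>> updateDataId({'day_obs': 20220503, 'seq_num': 123}, detector=0)
    {'day_obs': 20220503, 'seq_num': 123, 'detector': 0}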
188 """
190 match dataId:
191 case dafButler.DataCoordinate():
192 return dafButler.DataCoordinate.standardize(dataId, **kwargs)
193 case dict() as dataId:
194 return dict(dataId, **kwargs)
195 raise ValueError(f"Unknown dataId type {type(dataId)}")


def sanitize_day_obs(day_obs):
    """Take a string or int day_obs and turn it into the int version.

    Parameters
    ----------
    day_obs : `str` or `int`
        The day_obs to sanitize.

    Returns
    -------
    day_obs : `int`
        The sanitized day_obs.

    Raises
    ------
    ValueError
        Raised if the day_obs fails to translate for any reason.
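
    Examples
    --------
    Both int and ISO-formatted string inputs map to the same int form:

    >>> sanitize_day_obs(20220503)
    20220503
    >>> sanitize_day_obs('2022-05-03')
    20220503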
215 """
216 if isinstance(day_obs, int):
217 return day_obs
218 elif isinstance(day_obs, str):
219 try:
220 return int(day_obs.replace('-', ''))
221 except Exception:
222 ValueError(f'Failed to sanitize {day_obs!r} to a day_obs')
223 else:
224 raise ValueError(f'Cannot sanitize {day_obs!r} to a day_obs')


def getMostRecentDayObs(butler):
    """Get the most recent day_obs for which there is data.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    day_obs : `int`
        The day_obs.
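
    Examples
    --------
    Illustrative only; the value returned depends on the repo's contents:

    >>> butler = makeDefaultLatissButler()
    >>> getMostRecentDayObs(butler)
    20220503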
239 """
240 where = "exposure.day_obs>=RECENT_DAY"
241 records = butler.registry.queryDimensionRecords('exposure', where=where, datasets='raw',
242 bind={'RECENT_DAY': RECENT_DAY})
243 recentDay = max(r.day_obs for r in records)
244 _update_RECENT_DAY(recentDay)
245 return recentDay


def getSeqNumsForDayObs(butler, day_obs, extraWhere=''):
    """Get a list of all seq_nums taken on a given day_obs.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    day_obs : `int` or `str`
        The day_obs for which the seq_nums are desired.
    extraWhere : `str`, optional
        Any extra where conditions to add to the queryDimensionRecords call.

    Returns
    -------
    seq_nums : `iterable`
        The seq_nums taken on the corresponding day_obs, in ascending
        numerical order.
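
    Examples
    --------
    Illustrative only; the extra where condition uses the exposure
    dimension's column names:

    >>> butler = makeDefaultLatissButler()
    >>> seqNums = getSeqNumsForDayObs(butler, 20220503)
    >>> biasSeqNums = getSeqNumsForDayObs(butler, 20220503,
    ...                                   extraWhere="exposure.observation_type='bias'")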
265 """
266 day_obs = sanitize_day_obs(day_obs)
267 where = "exposure.day_obs=day_obs"
268 if extraWhere:
269 extraWhere = extraWhere.replace('"', '\'')
270 where += f" and {extraWhere}"
271 records = butler.registry.queryDimensionRecords("exposure",
272 where=where,
273 bind={'day_obs': day_obs},
274 datasets='raw')
275 return sorted([r.seq_num for r in records])


def sortRecordsByDayObsThenSeqNum(records):
    """Sort a set of records by day_obs, then seq_num, to get the order in
    which they were taken.

    Parameters
    ----------
    records : `list` of `lsst.daf.butler.DimensionRecord`
        The records to be sorted.

    Returns
    -------
    sortedRecords : `list` of `lsst.daf.butler.DimensionRecord`
        The sorted records.

    Raises
    ------
    ValueError
        Raised if the record set contains duplicate records, or if it
        contains (dayObs, seqNum) collisions.
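
    Examples
    --------
    Illustrative only; sorts a day's exposure records into acquisition order:

    >>> butler = makeDefaultLatissButler()
    >>> records = butler.registry.queryDimensionRecords("exposure",
    ...                                                 where="exposure.day_obs=20220503",
    ...                                                 datasets='raw')
    >>> ordered = sortRecordsByDayObsThenSeqNum(records)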
297 """
298 records = list(records) # must call list in case we have a generator
299 recordSet = set(records)
300 if len(records) != len(recordSet):
301 raise ValueError("Record set contains duplicate records and therefore cannot be sorted unambiguously")
303 daySeqTuples = [(r.day_obs, r.seq_num) for r in records]
304 if len(daySeqTuples) != len(set(daySeqTuples)):
305 raise ValueError("Record set contains dayObs/seqNum collisions, and therefore cannot be sorted "
306 "unambiguously")
308 records.sort(key=lambda r: (r.day_obs, r.seq_num))
309 return records


def getDaysWithData(butler):
    """Get all the days for which LATISS has taken data on the mountain.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    days : `list` of `int`
        A sorted list of the day_obs values for which mountain-top data exists.
    """
    # 20200101 is a day between shipping LATISS and going on sky.
    # We used to constrain on exposure.seq_num<50 to massively reduce the
    # number of returned records, whilst being large enough to ensure that no
    # days were missed because early seq_nums were skipped. However, because
    # we have test datasets like LATISS-test-data-tts, where we only kept
    # seqNums from 950 on one day, we can no longer assume this, so don't be
    # tempted to add such a constraint back in here for speed.
    where = "exposure.day_obs>20200101"
    records = butler.registry.queryDimensionRecords("exposure", where=where, datasets='raw')
    return sorted(set(r.day_obs for r in records))


def getMostRecentDataId(butler):
    """Get the dataId for the most recent observation.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    dataId : `dict`
        The dataId of the most recent exposure.
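
    Examples
    --------
    Illustrative only; the values depend on the repo's contents, and the
    exposure id shown is hypothetical:

    >>> butler = makeDefaultLatissButler()
    >>> getMostRecentDataId(butler)
    {'day_obs': 20220503, 'seq_num': 123, 'detector': 0, 'exposure': 2022050300123}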
349 """
350 lastDay = getMostRecentDayObs(butler)
351 seqNum = getSeqNumsForDayObs(butler, lastDay)[-1]
352 dataId = {'day_obs': lastDay, 'seq_num': seqNum, 'detector': 0}
353 dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))
354 return dataId


def getExpIdFromDayObsSeqNum(butler, dataId):
    """Get the exposure id for the dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    dataId : `dict`
        The dataId for which to return the exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing the exposure id, keyed by 'exposure'.
    """
    expRecord = getExpRecordFromDataId(butler, dataId)
    return {'exposure': expRecord.id}


def updateDataIdOrDataCord(dataId, **updateKwargs):
    """Add key, value pairs to a dataId or data coordinate.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The dataId to update.
    updateKwargs : `dict`
        The key, value pairs to add to the dataId or dataCoord.

    Returns
    -------
    dataId : `dict`
        The updated dataId.

    Notes
    -----
    This always returns a dict, so note that if a data coordinate is
    supplied, a dict is returned, changing the type.
    """
    newId = copy.copy(dataId)
    newId = _assureDict(newId)
    newId.update(updateKwargs)
    return newId


def fillDataId(butler, dataId):
    """Given a dataId, fill it with values for all available dimensions.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId to fill.

    Returns
    -------
    dataId : `dict`
        The filled dataId.

    Notes
    -----
    This function is *slow*! Running it on 20,000 dataIds takes approximately
    7 minutes. Virtually all of the slowdown is in the
    butler.registry.expandDataId() call though, so this wrapper is not to
    blame here, and it might speed up in future with butler improvements.
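
    Examples
    --------
    Illustrative only; a dataId containing just a hypothetical exposure id
    comes back with the other available keys, e.g. day_obs and seq_num,
    filled in:

    >>> butler = makeDefaultLatissButler()
    >>> filled = fillDataId(butler, {'exposure': 2022050300123})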
423 """
424 # ensure it's a dict to deal with records etc
425 dataId = _assureDict(dataId)
427 # this removes extraneous keys that would trip up the registry call
428 # using _rewrite_data_id is perhaps ever so slightly slower than popping
429 # the bad keys, or making a minimal dataId by hand, but is more
430 # reliable/general, so we choose that over the other approach here
431 dataId, _ = butler._rewrite_data_id(dataId, butler.registry.getDatasetType('raw'))
433 # now expand and turn back to a dict
434 dataId = butler.registry.expandDataId(dataId, detector=0).full # this call is VERY slow
435 dataId = _assureDict(dataId)
437 missingExpId = getExpId(dataId) is None
438 missingDayObs = getDayObs(dataId) is None
439 missingSeqNum = getSeqNum(dataId) is None
441 if missingDayObs or missingSeqNum:
442 dayObsSeqNum = getDayObsSeqNumFromExposureId(butler, dataId)
443 dataId.update(dayObsSeqNum)
445 if missingExpId:
446 expId = getExpIdFromDayObsSeqNum(butler, dataId)
447 dataId.update(expId)
449 return dataId


def _assureDict(dataId):
    """Turn any data-identifier-like object into a dict.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.dimensions.DataCoordinate` or
             `lsst.daf.butler.dimensions.DimensionRecord`
        The data identifier.

    Returns
    -------
    dataId : `dict`
        The data identifier as a dict.
    """
    if isinstance(dataId, dict):
        return dataId
    elif hasattr(dataId, 'items'):  # dafButler.dimensions.DataCoordinate
        return {str(k): v for k, v in dataId.items()}  # str() required due to full names
    elif hasattr(dataId, 'dataId'):  # dafButler.dimensions.DimensionRecord
        return {str(k): v for k, v in dataId.dataId.items()}
    else:
        raise RuntimeError(f'Failed to coerce {type(dataId)} to dict')


def getExpRecordFromDataId(butler, dataId):
    """Get the exposure record for a given dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId.

    Returns
    -------
    expRecord : `lsst.daf.butler.dimensions.ExposureRecord`
        The exposure record.
    """
    dataId = _assureDict(dataId)
    assert isinstance(dataId, dict), f'dataId must be a dict or DimensionRecord, got {type(dataId)}'

    if expId := getExpId(dataId):
        where = "exposure.id=expId"
        expRecords = butler.registry.queryDimensionRecords("exposure",
                                                           where=where,
                                                           bind={'expId': expId},
                                                           datasets='raw')
    else:
        dayObs = getDayObs(dataId)
        seqNum = getSeqNum(dataId)
        if dayObs is None or seqNum is None:
            raise RuntimeError(f'Failed to find either expId or day_obs and seq_num in dataId {dataId}')
        where = "exposure.day_obs=day_obs AND exposure.seq_num=seq_num"
        expRecords = butler.registry.queryDimensionRecords("exposure",
                                                           where=where,
                                                           bind={'day_obs': dayObs, 'seq_num': seqNum},
                                                           datasets='raw')

    expRecords = set(expRecords)
    if not expRecords:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(expRecords) == 1, f'Found {len(expRecords)} exposure records for {dataId}'
    return expRecords.pop()


def getDayObsSeqNumFromExposureId(butler, dataId):
    """Get the day_obs and seq_num for an exposure id.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict` or `int`
        The dataId containing the exposure id, or the exposure id itself.

    Returns
    -------
    dataId : `dict`
        A dict containing only the day_obs and seq_num.
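
    Examples
    --------
    Illustrative only; the exposure id here is for a hypothetical image:

    >>> butler = makeDefaultLatissButler()
    >>> getDayObsSeqNumFromExposureId(butler, 2022050300123)
    {'day_obs': 20220503, 'seq_num': 123}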
533 """
534 if (dayObs := getDayObs(dataId)) and (seqNum := getSeqNum(dataId)):
535 return {'day_obs': dayObs, 'seq_num': seqNum}
537 if isinstance(dataId, int):
538 dataId = {'exposure': dataId}
539 else:
540 dataId = _assureDict(dataId)
541 assert isinstance(dataId, dict)
543 if not (expId := getExpId(dataId)):
544 raise RuntimeError(f'Failed to find exposure id in {dataId}')
546 where = "exposure.id=expId"
547 expRecords = butler.registry.queryDimensionRecords("exposure",
548 where=where,
549 bind={'expId': expId},
550 datasets='raw')
551 expRecords = set(expRecords)
552 if not expRecords:
553 raise LookupError(f"No exposure records found for {dataId}")
554 assert len(expRecords) == 1, f'Found {len(expRecords)} exposure records for {dataId}'
555 record = expRecords.pop()
556 return {'day_obs': record.day_obs, 'seq_num': record.seq_num}


def getDatasetRefForDataId(butler, datasetType, dataId):
    """Get the dataset reference for a dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `lsst.daf.butler.DatasetType`
        The dataset type.
    dataId : `dict`
        The dataId.

    Returns
    -------
    datasetRef : `lsst.daf.butler.DatasetRef`
        The dataset reference.
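
    Examples
    --------
    Illustrative only; looks up the reference for a raw, given a day_obs and
    seq_num:

    >>> butler = makeDefaultLatissButler()
    >>> dRef = getDatasetRefForDataId(butler, 'raw',
    ...                               {'day_obs': 20220503, 'seq_num': 123, 'detector': 0})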
575 """
576 if not _expid_present(dataId):
577 assert _dayobs_present(dataId) and _seqnum_present(dataId)
578 dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))
580 dRef = butler.registry.findDataset(datasetType, dataId)
581 return dRef


def removeDataProduct(butler, datasetType, dataId):
    """Remove a data product from the registry. Use with caution.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `lsst.daf.butler.DatasetType`
        The dataset type.
    dataId : `dict`
        The dataId.
    """
    if datasetType == 'raw':
        raise RuntimeError("I'm sorry, Dave, I'm afraid I can't do that.")
    dRef = getDatasetRefForDataId(butler, datasetType, dataId)
    butler.pruneDatasets([dRef], disassociate=True, unstore=True, purge=True)
    return


def _dayobs_present(dataId):
    return _get_dayobs_key(dataId) is not None


def _seqnum_present(dataId):
    return _get_seqnum_key(dataId) is not None


def _expid_present(dataId):
    return _get_expid_key(dataId) is not None


def _get_dayobs_key(dataId):
    """Return the key for day_obs if present, else None."""
    keys = [k for k in dataId.keys() if k.find('day_obs') != -1]
    if not keys:
        return None
    return keys[0]


def _get_seqnum_key(dataId):
    """Return the key for seq_num if present, else None."""
    keys = [k for k in dataId.keys() if k.find('seq_num') != -1]
    if not keys:
        return None
    return keys[0]


def _get_expid_key(dataId):
    """Return the key for the expId if present, else None."""
    if 'exposure.id' in dataId:
        return 'exposure.id'
    elif 'exposure' in dataId:
        return 'exposure'
    return None


def getDayObs(dataId):
    """Get the day_obs from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    day_obs : `int` or `None`
        The day_obs value if present, else None.
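
    Examples
    --------
    Works on plain dict dataIds; exposure records are handled via their
    day_obs attribute:

    >>> getDayObs({'day_obs': 20220503, 'seq_num': 123})
    20220503
    >>> getDayObs({'detector': 0}) is None
    True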
656 """
657 if hasattr(dataId, 'day_obs'):
658 return getattr(dataId, 'day_obs')
659 if not _dayobs_present(dataId):
660 return None
661 return dataId['day_obs'] if 'day_obs' in dataId else dataId['exposure.day_obs']


def getSeqNum(dataId):
    """Get the seq_num from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    seq_num : `int` or `None`
        The seq_num value if present, else None.
    """
    if hasattr(dataId, 'seq_num'):
        return getattr(dataId, 'seq_num')
    if not _seqnum_present(dataId):
        return None
    return dataId['seq_num'] if 'seq_num' in dataId else dataId['exposure.seq_num']


def getExpId(dataId):
    """Get the expId from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    expId : `int` or `None`
        The expId value if present, else None.
    """
    if hasattr(dataId, 'id'):
        return getattr(dataId, 'id')
    if not _expid_present(dataId):
        return None
    return dataId['exposure'] if 'exposure' in dataId else dataId['exposure.id']


def getLatissOnSkyDataIds(butler, skipTypes=('bias', 'dark', 'flat'), checkObject=True, full=True,
                          startDate=None, endDate=None):
    """Get a list of all on-sky dataIds taken.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    skipTypes : `list` of `str`, optional
        Image types to exclude.
    checkObject : `bool`, optional
        Check if the value of target_name (formerly OBJECT) is set, and
        exclude the exposure if it is not.
    full : `bool`, optional
        Return filled dataIds. Required for some analyses, but runs much
        (~30x) slower.
    startDate : `int`, optional
        The day_obs to start at, inclusive.
    endDate : `int`, optional
        The day_obs to end at, inclusive.

    Returns
    -------
    dataIds : `list` of `dict`
        The dataIds.
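
    Examples
    --------
    Illustrative only; gets the unfilled on-sky dataIds for a range of days:

    >>> butler = makeDefaultLatissButler()
    >>> dataIds = getLatissOnSkyDataIds(butler, full=False,
    ...                                 startDate=20220501, endDate=20220503)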
729 """
730 def isOnSky(expRecord):
731 imageType = expRecord.observation_type
732 obj = expRecord.target_name
733 if checkObject and obj == 'NOTSET':
734 return False
735 if imageType not in skipTypes:
736 return True
737 return False
739 recordSets = []
740 days = getDaysWithData(butler)
741 if startDate:
742 days = [d for d in days if d >= startDate]
743 if endDate:
744 days = [d for d in days if d <= endDate]
745 days = sorted(set(days))
747 where = "exposure.day_obs=day_obs"
748 for day in days:
749 # queryDataIds would be better here, but it's then hard/impossible
750 # to do the filtering for which is on sky, so just take the dataIds
751 records = butler.registry.queryDimensionRecords("exposure",
752 where=where,
753 bind={'day_obs': day},
754 datasets='raw')
755 recordSets.append(sortRecordsByDayObsThenSeqNum(records))
757 dataIds = [r.dataId for r in filter(isOnSky, itertools.chain(*recordSets))]
758 if full:
759 expandedIds = [updateDataIdOrDataCord(butler.registry.expandDataId(dataId, detector=0).full)
760 for dataId in dataIds]
761 filledIds = [fillDataId(butler, dataId) for dataId in expandedIds]
762 return filledIds
763 else:
764 return [updateDataIdOrDataCord(dataId, detector=0) for dataId in dataIds]