Coverage for python/lsst/summit/utils/butlerUtils.py: 13%
240 statements
« prev ^ index » next coverage.py v7.2.6, created at 2023-05-24 03:00 -0700
1# This file is part of summit_utils.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22import lsst.daf.butler as dafButler
23import itertools
24import copy
26from lsst.summit.utils.utils import getSite
29__all__ = ["makeDefaultLatissButler",
30 "updateDataId",
31 "sanitize_day_obs",
32 "getMostRecentDayObs",
33 "getSeqNumsForDayObs",
34 "getMostRecentDataId",
35 "getDatasetRefForDataId",
36 "getDayObs",
37 "getSeqNum",
38 "getExpId",
39 "datasetExists",
40 "sortRecordsByDayObsThenSeqNum",
41 "getDaysWithData",
42 "getExpIdFromDayObsSeqNum",
43 "updateDataIdOrDataCord",
44 "fillDataId",
45 "getExpRecordFromDataId",
46 "getDayObsSeqNumFromExposureId",
47 "removeDataProduct",
48 "getLatissOnSkyDataIds",
49 ]
51_LATISS_DEFAULT_COLLECTIONS = ['LATISS/raw/all', 'LATISS/calib', "LATISS/runs/quickLook"]
53# RECENT_DAY must be in the past *and have data* (otherwise some tests are
54# no-ops), to speed up queries by restricting them significantly,
55# but data must definitely been taken since. Should
56# also not be more than 2 months in the past due to 60 day lookback time on the
57# summit. All this means it should be updated by an informed human.
58RECENT_DAY = 20220503
def _configureForSite():
    """Adjust module-level configuration for the site the code is running at.

    This is run automatically on module import, so it must never raise
    merely because the site cannot be determined (e.g. inside k8s).
    """
    site = None
    try:
        site = getSite()
    except ValueError:
        print("WARNING: failed to automatically determine site")

    if site == 'tucson':
        # TTS has limited data, so use a day known to have data there
        global RECENT_DAY
        RECENT_DAY = 20211104


_configureForSite()
def getLatissDefaultCollections():
    """Get the default set of LATISS collections, updated for the site at
    which the code is being run.

    Returns
    -------
    collections : `list` of `str`
        The default collections for the site.
    """
    # Copy the module-level list: the original code appended to the global
    # directly, so repeated calls accumulated duplicate entries and mutated
    # the default for every other caller.
    collections = list(_LATISS_DEFAULT_COLLECTIONS)
    try:
        site = getSite()
    except ValueError:
        site = ''

    # The test-data collection is present at both the TTS and the summit.
    if site in ('tucson', 'summit'):
        collections.append("LATISS-test-data")
    return collections
def _update_RECENT_DAY(day):
    """Update the value for RECENT_DAY once we have a value for free."""
    global RECENT_DAY
    # Keep RECENT_DAY at the most recent known day-with-data, minus one.
    if day - 1 > RECENT_DAY:
        RECENT_DAY = day - 1
def makeDefaultLatissButler(*, extraCollections=None, writeable=False, embargo=False):
    """Create a butler for LATISS using the default collections.

    Parameters
    ----------
    extraCollections : `list` of `str`, optional
        Extra input collections to supply to the butler init.
    writeable : `bool`, optional
        Whether to make a writeable butler.
    embargo : `bool`, optional
        Use the embargo repo instead of the main one. Needed to access
        embargoed data.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        The butler.

    Raises
    ------
    FileNotFoundError
        Raised if the repo cannot be instantiated, regardless of which
        exception type the underlying butler raised.
    """
    # TODO: Add logging to which collections are going in
    # Copy defensively so extending with extraCollections cannot mutate a
    # list shared with other callers.
    collections = list(getLatissDefaultCollections())
    if extraCollections:
        collections.extend(extraCollections)
    repoString = "/repo/embargo" if embargo else "LATISS"
    try:
        butler = dafButler.Butler(repoString,
                                  collections=collections,
                                  writeable=writeable,
                                  instrument='LATISS')
    except (FileNotFoundError, RuntimeError) as e:
        # Depending on the value of DAF_BUTLER_REPOSITORY_INDEX and whether
        # it is present and blank, or just not set, both these exception
        # types can be raised, see tests/test_butlerUtils.py:ButlerInitTestCase
        # for details and tests which confirm these have not changed.
        # Chain with `from e` so the original failure is not lost while
        # still unifying the exception type for callers.
        raise FileNotFoundError("Failed to instantiate a LATISS butler") from e
    return butler
# TODO: DM-32940 can remove this whole function once this ticket merges.
def datasetExists(butler, dataProduct, dataId, **kwargs):
    """Collapse the tri-state behaviour of butler.datasetExists to a boolean.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler
    dataProduct : `str`
        The type of data product to check for
    dataId : `dict`
        The dataId of the dataProduct to check for

    Returns
    -------
    exists : `bool`
        True if the dataProduct exists for the dataProduct and can be
        retrieved, else False.
    """
    try:
        return butler.datasetExists(dataProduct, dataId, **kwargs)
    except (LookupError, RuntimeError):
        # either exception means "not retrievable", so report non-existence
        return False
def updateDataId(dataId, **kwargs):
    """Update a DataCoordinate or dataId dict with kwargs.

    Provides a single interface for adding the detector key (or others) to a
    dataId whether it's a DataCoordinate or a dict.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The dataId to update.
    kwargs : `dict`
        The keys and values to add to the dataId.

    Returns
    -------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The updated dataId, with the same type as the input.
    """
    if isinstance(dataId, dafButler.DataCoordinate):
        return dafButler.DataCoordinate.standardize(dataId, **kwargs)
    if isinstance(dataId, dict):
        return dict(dataId, **kwargs)
    raise ValueError(f"Unknown dataId type {type(dataId)}")
def sanitize_day_obs(day_obs):
    """Take string or int day_obs and turn it into the int version.

    Parameters
    ----------
    day_obs : `str` or `int`
        The day_obs to sanitize, e.g. 20220503 or '2022-05-03'.

    Returns
    -------
    day_obs : `int`
        The sanitized day_obs.

    Raises
    ------
    ValueError
        Raised if the day_obs fails to translate for any reason.
    """
    if isinstance(day_obs, int):
        return day_obs
    elif isinstance(day_obs, str):
        try:
            return int(day_obs.replace('-', ''))
        except Exception as e:
            # Bug fix: the original constructed this ValueError but never
            # raised it, silently falling through and returning None.
            raise ValueError(f'Failed to sanitize {day_obs!r} to a day_obs') from e
    else:
        raise ValueError(f'Cannot sanitize {day_obs!r} to a day_obs')
def getMostRecentDayObs(butler):
    """Get the most recent day_obs for which there is data.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    day_obs : `int`
        The day_obs.
    """
    # restrict to recent days to keep the registry query fast
    records = butler.registry.queryDimensionRecords('exposure',
                                                    where="exposure.day_obs>=RECENT_DAY",
                                                    datasets='raw',
                                                    bind={'RECENT_DAY': RECENT_DAY})
    recentDay = max(record.day_obs for record in records)
    _update_RECENT_DAY(recentDay)  # cache the result to speed up later queries
    return recentDay
def getSeqNumsForDayObs(butler, day_obs, extraWhere=''):
    """Get a list of all seq_nums taken on a given day_obs.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    day_obs : `int` or `str`
        The day_obs for which the seq_nums are desired.
    extraWhere : `str`
        Any extra where conditions to add to the queryDimensionRecords call.

    Returns
    -------
    seq_nums : `iterable`
        The seq_nums taken on the corresponding day_obs in ascending numerical
        order.
    """
    day_obs = sanitize_day_obs(day_obs)
    where = "exposure.day_obs=day_obs"
    if extraWhere:
        # the registry query language requires single quotes
        where = where + " and " + extraWhere.replace('"', "'")
    records = butler.registry.queryDimensionRecords("exposure",
                                                    where=where,
                                                    bind={'day_obs': day_obs},
                                                    datasets='raw')
    return sorted(record.seq_num for record in records)
def sortRecordsByDayObsThenSeqNum(records):
    """Sort a set of records by dayObs, then seqNum to get the order in which
    they were taken.

    Parameters
    ----------
    records : `list` of `dict`
        The records to be sorted.

    Returns
    -------
    sortedRecords : `list` of `dict`
        The sorted records

    Raises
    ------
    ValueError
        Raised if the recordSet contains duplicate records, or if it contains
        (dayObs, seqNum) collisions.
    """
    recordList = list(records)  # records may be a generator, so materialize
    if len(set(recordList)) != len(recordList):
        raise ValueError("Record set contains duplicate records and therefore cannot be sorted unambiguously")

    sortKeys = [(record.day_obs, record.seq_num) for record in recordList]
    if len(set(sortKeys)) != len(sortKeys):
        raise ValueError("Record set contains dayObs/seqNum collisions, and therefore cannot be sorted "
                         "unambiguously")

    return sorted(recordList, key=lambda record: (record.day_obs, record.seq_num))
def getDaysWithData(butler, datasetType='raw'):
    """Get all the days for which LATISS has taken data on the mountain.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    datasetType : `str`
        The datasetType to query.

    Returns
    -------
    days : `list` of `int`
        A sorted list of the day_obs values for which mountain-top data exists.
    """
    # 20200101 is a day between shipping LATISS and going on sky.
    # We used to constrain on exposure.seq_num<50 to massively reduce the
    # number of returned records whilst being large enough to ensure that no
    # days are missed because early seq_nums were skipped. However, because
    # we have test datasets like LATISS-test-data-tts where we only kept
    # seqNums from 950 on one day, we can no longer assume this so don't be
    # tempted to add such a constraint back in here for speed.
    records = butler.registry.queryDimensionRecords("exposure",
                                                    where="exposure.day_obs>20200101",
                                                    datasets=datasetType)
    return sorted({record.day_obs for record in records})
def getMostRecentDataId(butler):
    """Get the dataId for the most recent observation.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    dataId : `dict`
        The dataId of the most recent exposure.
    """
    day = getMostRecentDayObs(butler)
    # seq_nums come back sorted ascending, so the last one is the newest
    dataId = {'day_obs': day,
              'seq_num': getSeqNumsForDayObs(butler, day)[-1],
              'detector': 0}
    dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))
    return dataId
def getExpIdFromDayObsSeqNum(butler, dataId):
    """Get the exposure id for the dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    dataId : `dict`
        The dataId for which to return the exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing only the exposure id.
    """
    record = getExpRecordFromDataId(butler, dataId)
    return {'exposure': record.id}
def updateDataIdOrDataCord(dataId, **updateKwargs):
    """Add key, value pairs to a dataId or data coordinate.

    Parameters
    ----------
    dataId : `dict`
        The dataId for which to return the exposure id.
    updateKwargs : `dict`
        The key value pairs add to the dataId or dataCoord.

    Returns
    -------
    dataId : `dict`
        The updated dataId.

    Notes
    -----
    Always returns a dict, so note that if a data coordinate is supplied, a
    dict is returned, changing the type.
    """
    # Build a fresh dict so the input is never mutated.
    asDict = _assureDict(dataId)
    return {**asDict, **updateKwargs}
def fillDataId(butler, dataId):
    """Given a dataId, fill it with values for all available dimensions.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId to fill.

    Returns
    -------
    dataId : `dict`
        The filled dataId, always a plain dict regardless of input type.

    Notes
    -----
    This function is *slow*! Running this on 20,000 dataIds takes approximately
    7 minutes. Virtually all the slowdown is in the
    butler.registry.expandDataId() call though, so this wrapper is not to blame
    here, and might speed up in future with butler improvements.
    """
    # ensure it's a dict to deal with records etc
    dataId = _assureDict(dataId)

    # this removes extraneous keys that would trip up the registry call
    # using _rewrite_data_id is perhaps ever so slightly slower than popping
    # the bad keys, or making a minimal dataId by hand, but is more
    # reliable/general, so we choose that over the other approach here
    # NOTE(review): _rewrite_data_id is a private butler API — confirm it is
    # still available on upgrade.
    dataId, _ = butler._rewrite_data_id(dataId, butler.registry.getDatasetType('raw'))

    # now expand and turn back to a dict
    dataId = butler.registry.expandDataId(dataId, detector=0).full  # this call is VERY slow
    dataId = _assureDict(dataId)

    # record which identifiers are still absent after expansion; expandDataId
    # does not necessarily provide all three
    missingExpId = getExpId(dataId) is None
    missingDayObs = getDayObs(dataId) is None
    missingSeqNum = getSeqNum(dataId) is None

    # fill day_obs/seq_num from the exposure id, and vice versa, via registry
    if missingDayObs or missingSeqNum:
        dayObsSeqNum = getDayObsSeqNumFromExposureId(butler, dataId)
        dataId.update(dayObsSeqNum)

    if missingExpId:
        expId = getExpIdFromDayObsSeqNum(butler, dataId)
        dataId.update(expId)

    return dataId
454def _assureDict(dataId):
455 """Turn any data-identifier-like object into a dict.
457 Parameters
458 ----------
459 dataId : `dict` or `lsst.daf.butler.dimensions.DataCoordinate` or
460 `lsst.daf.butler.dimensions.DimensionRecord`
461 The data identifier.
463 Returns
464 -------
465 dataId : `dict`
466 The data identifier as a dict.
467 """
468 if isinstance(dataId, dict):
469 return dataId
470 elif hasattr(dataId, 'items'): # dafButler.dimensions.DataCoordinate
471 return {str(k): v for k, v in dataId.items()} # str() required due to full names
472 elif hasattr(dataId, 'dataId'): # dafButler.dimensions.DimensionRecord
473 return {str(k): v for k, v in dataId.dataId.items()}
474 else:
475 raise RuntimeError(f'Failed to coerce {type(dataId)} to dict')
def getExpRecordFromDataId(butler, dataId):
    """Get the exposure record for a given dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId.

    Returns
    -------
    expRecord : `lsst.daf.butler.dimensions.ExposureRecord`
        The exposure record.
    """
    dataId = _assureDict(dataId)
    assert isinstance(dataId, dict), f'dataId must be a dict or DimensionRecord, got {type(dataId)}'

    # Prefer a direct exposure-id lookup; fall back to (day_obs, seq_num).
    expId = getExpId(dataId)
    if expId:
        where = "exposure.id=expId"
        bind = {'expId': expId}
    else:
        dayObs = getDayObs(dataId)
        seqNum = getSeqNum(dataId)
        if not (dayObs and seqNum):
            raise RuntimeError(f'Failed to find either expId or day_obs and seq_num in dataId {dataId}')
        where = "exposure.day_obs=day_obs AND exposure.seq_num=seq_num"
        bind = {'day_obs': dayObs, 'seq_num': seqNum}

    expRecords = set(butler.registry.queryDimensionRecords("exposure",
                                                           where=where,
                                                           bind=bind,
                                                           datasets='raw'))
    if not expRecords:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(expRecords) == 1, f'Found {len(expRecords)} exposure records for {dataId}'
    return expRecords.pop()
def getDayObsSeqNumFromExposureId(butler, dataId):
    """Get the day_obs and seq_num for an exposure id.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict` or `int`
        The dataId containing the exposure id, or the bare exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing only the day_obs and seq_num.
    """
    if isinstance(dataId, int):
        # Bug fix: the int case must be handled before calling
        # getDayObs/getSeqNum, which require a mapping-like dataId and
        # raised AttributeError on a bare exposure id in the original.
        dataId = {'exposure': dataId}
    else:
        # Nothing to look up if the dataId already carries both values.
        if (dayObs := getDayObs(dataId)) and (seqNum := getSeqNum(dataId)):
            return {'day_obs': dayObs, 'seq_num': seqNum}
        dataId = _assureDict(dataId)
    assert isinstance(dataId, dict)

    if not (expId := getExpId(dataId)):
        raise RuntimeError(f'Failed to find exposure id in {dataId}')

    where = "exposure.id=expId"
    expRecords = butler.registry.queryDimensionRecords("exposure",
                                                       where=where,
                                                       bind={'expId': expId},
                                                       datasets='raw')
    expRecords = set(expRecords)
    if not expRecords:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(expRecords) == 1, f'Found {len(expRecords)} exposure records for {dataId}'
    record = expRecords.pop()
    return {'day_obs': record.day_obs, 'seq_num': record.seq_num}
def getDatasetRefForDataId(butler, datasetType, dataId):
    """Get the datasetReference for a dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `datasetType`
        The dataset type.
    dataId : `dict`
        The dataId.

    Returns
    -------
    datasetRef : `lsst.daf.butler.dimensions.DatasetReference`
        The dataset reference.
    """
    if not _expid_present(dataId):
        # without an exposure id we need day_obs and seq_num to resolve one
        assert _dayobs_present(dataId) and _seqnum_present(dataId)
        dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))
    return butler.registry.findDataset(datasetType, dataId)
def removeDataProduct(butler, datasetType, dataId):
    """Remove a data product from the registry. Use with caution.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `datasetType`
        The dataset type.
    dataId : `dict`
        The dataId.
    """
    # refuse to ever delete raw data
    if datasetType == 'raw':
        raise RuntimeError("I'm sorry, Dave, I'm afraid I can't do that.")
    dRef = getDatasetRefForDataId(butler, datasetType, dataId)
    butler.pruneDatasets([dRef], disassociate=True, unstore=True, purge=True)
    return
def _dayobs_present(dataId):
    """Return whether any day_obs-like key is present in the dataId."""
    key = _get_dayobs_key(dataId)
    return key is not None
def _seqnum_present(dataId):
    """Return whether any seq_num-like key is present in the dataId."""
    key = _get_seqnum_key(dataId)
    return key is not None
def _expid_present(dataId):
    """Return whether an exposure-id key is present in the dataId."""
    key = _get_expid_key(dataId)
    return key is not None
618def _get_dayobs_key(dataId):
619 """Return the key for day_obs if present, else None
620 """
621 keys = [k for k in dataId.keys() if k.find('day_obs') != -1]
622 if not keys:
623 return None
624 return keys[0]
627def _get_seqnum_key(dataId):
628 """Return the key for seq_num if present, else None
629 """
630 keys = [k for k in dataId.keys() if k.find('seq_num') != -1]
631 if not keys:
632 return None
633 return keys[0]
636def _get_expid_key(dataId):
637 """Return the key for expId if present, else None
638 """
639 if 'exposure.id' in dataId:
640 return 'exposure.id'
641 elif 'exposure' in dataId:
642 return 'exposure'
643 return None
def getDayObs(dataId):
    """Get the day_obs from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    day_obs : `int` or `None`
        The day_obs value if present, else None.
    """
    # DimensionRecords expose day_obs as an attribute directly
    if hasattr(dataId, 'day_obs'):
        return dataId.day_obs
    if _dayobs_present(dataId):
        key = 'day_obs' if 'day_obs' in dataId else 'exposure.day_obs'
        return dataId[key]
    return None
def getSeqNum(dataId):
    """Get the seq_num from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    seq_num : `int` or `None`
        The seq_num value if present, else None.
    """
    # DimensionRecords expose seq_num as an attribute directly
    if hasattr(dataId, 'seq_num'):
        return dataId.seq_num
    if _seqnum_present(dataId):
        key = 'seq_num' if 'seq_num' in dataId else 'exposure.seq_num'
        return dataId[key]
    return None
def getExpId(dataId):
    """Get the expId from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    expId : `int` or `None`
        The expId value if present, else None.
    """
    # DimensionRecords expose the exposure id as the `id` attribute
    if hasattr(dataId, 'id'):
        return dataId.id
    if _expid_present(dataId):
        key = 'exposure' if 'exposure' in dataId else 'exposure.id'
        return dataId[key]
    return None
def getLatissOnSkyDataIds(butler, skipTypes=('bias', 'dark', 'flat'), checkObject=True, full=True,
                          startDate=None, endDate=None):
    """Get a list of all on-sky dataIds taken.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    skipTypes : `list` of `str`
        Image types to exclude.
    checkObject : `bool`
        Check if the value of target_name (formerly OBJECT) is set and exclude
        if it is not.
    full : `bool`
        Return filled dataIds. Required for some analyses, but runs much
        (~30x) slower.
    startDate : `int`
        The day_obs to start at, inclusive.
    endDate : `int`
        The day_obs to end at, inclusive.

    Returns
    -------
    dataIds : `list` of `dataId`
        The dataIds.
    """
    def isOnSky(expRecord):
        # an unset target name means not a real on-sky observation
        if checkObject and expRecord.target_name == 'NOTSET':
            return False
        return expRecord.observation_type not in skipTypes

    days = getDaysWithData(butler)
    if startDate:
        days = [day for day in days if day >= startDate]
    if endDate:
        days = [day for day in days if day <= endDate]
    days = sorted(set(days))

    recordSets = []
    for day in days:
        # queryDataIds would be better here, but it's then hard/impossible
        # to do the filtering for which is on sky, so just take the dataIds
        records = butler.registry.queryDimensionRecords("exposure",
                                                        where="exposure.day_obs=day_obs",
                                                        bind={'day_obs': day},
                                                        datasets='raw')
        recordSets.append(sortRecordsByDayObsThenSeqNum(records))

    dataIds = [record.dataId for record in itertools.chain(*recordSets) if isOnSky(record)]
    if not full:
        return [updateDataIdOrDataCord(dataId, detector=0) for dataId in dataIds]
    expandedIds = [updateDataIdOrDataCord(butler.registry.expandDataId(dataId, detector=0).full)
                   for dataId in dataIds]
    return [fillDataId(butler, dataId) for dataId in expandedIds]