Coverage for python/lsst/summit/utils/butlerUtils.py: 14% (238 statements)

# This file is part of summit_utils.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import lsst.daf.butler as dafButler
import itertools
import copy
from deprecated.sphinx import deprecated

from lsst.summit.utils.utils import getSite

__all__ = ["makeDefaultLatissButler",
           "updateDataId",
           "sanitize_day_obs",
           "getMostRecentDayObs",
           "getSeqNumsForDayObs",
           "getMostRecentDataId",
           "getDatasetRefForDataId",
           "getDayObs",
           "getSeqNum",
           "getExpId",
           "datasetExists",
           "sortRecordsByDayObsThenSeqNum",
           "getDaysWithData",
           "getExpIdFromDayObsSeqNum",
           "updateDataIdOrDataCord",
           "fillDataId",
           "getExpRecordFromDataId",
           "getDayObsSeqNumFromExposureId",
           "removeDataProduct",
           "getLatissOnSkyDataIds",
           ]

_LATISS_DEFAULT_COLLECTIONS = ['LATISS/raw/all', 'LATISS/calib', 'LATISS/runs/quickLook']

# RECENT_DAY must be in the past *and have data* (otherwise some tests are
# no-ops). It speeds up queries by restricting them significantly, but data
# must definitely have been taken since it. It should also not be more than
# 2 months in the past, due to the 60 day lookback time on the summit. All
# this means it should be updated by an informed human.
RECENT_DAY = 20220503


def _configureForSite():
    try:
        site = getSite()
    except ValueError:
        # this function is run automatically on module import, so don't
        # fail for k8s, where the site cannot yet be determined
        print("WARNING: failed to automatically determine site")
        site = None

    if site == 'tucson':
        global RECENT_DAY
        RECENT_DAY = 20211104  # TTS has limited data, so use this day


_configureForSite()


def getLatissDefaultCollections():
    """Get the default set of LATISS collections, updated for the site at
    which the code is being run.

    Returns
    -------
    collections : `list` of `str`
        The default collections for the site.
    """
    collections = list(_LATISS_DEFAULT_COLLECTIONS)  # copy, so appending can't mutate the default
    try:
        site = getSite()
    except ValueError:
        site = ''

    if site in ('tucson', 'summit'):
        collections.append("LATISS-test-data")
    return collections
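

# A minimal usage sketch of the above (illustrative only; it assumes the
# "LATISS" repo alias is resolvable at the site where this is run):
#
#     collections = getLatissDefaultCollections()
#     butler = dafButler.Butler("LATISS", collections=collections)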


def _update_RECENT_DAY(day):
    """Update the value for RECENT_DAY once we have a value for free."""
    global RECENT_DAY
    RECENT_DAY = max(day - 1, RECENT_DAY)


def makeDefaultLatissButler(*, extraCollections=None, writeable=False, embargo=False):
    """Create a butler for LATISS using the default collections.

    Parameters
    ----------
    extraCollections : `list` of `str`, optional
        Extra input collections to supply to the butler init.
    writeable : `bool`, optional
        Whether to make a writeable butler.
    embargo : `bool`, optional
        Use the embargo repo instead of the main one. Needed to access
        embargoed data.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        The butler.
    """
    # TODO: add logging of which collections are being used
    collections = getLatissDefaultCollections()
    if extraCollections:
        collections.extend(extraCollections)
    try:
        repoString = "LATISS" if not embargo else "/repo/embargo"
        butler = dafButler.Butler(repoString,
                                  collections=collections,
                                  writeable=writeable,
                                  instrument='LATISS')
    except (FileNotFoundError, RuntimeError):
        # Depending on the value of DAF_BUTLER_REPOSITORY_INDEX, and whether
        # it is present and blank or just not set, both these exception types
        # can be raised; see tests/test_butlerUtils.py:ButlerInitTestCase for
        # details, and for the tests which confirm these have not changed.
        raise FileNotFoundError  # unify the exception type
    return butler
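

# A hypothetical usage sketch for makeDefaultLatissButler; the extra
# collection name here is made up purely for illustration:
#
#     butler = makeDefaultLatissButler(extraCollections=['u/username/myRun'],
#                                      embargo=True)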


@deprecated(
    reason="datasetExists has been replaced by Butler.exists(). Will be removed after v26.0.",
    version="v26.0",
    category=FutureWarning,
)
def datasetExists(butler, dataProduct, dataId, **kwargs):
    """Collapse the tri-state behaviour of butler.datasetExists to a boolean.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataProduct : `str`
        The type of data product to check for.
    dataId : `dict`
        The dataId of the dataProduct to check for.

    Returns
    -------
    exists : `bool`
        True if the dataProduct exists for the dataId and can be retrieved,
        else False.
    """
    return butler.exists(dataProduct, dataId, **kwargs)


def updateDataId(dataId, **kwargs):
    """Update a DataCoordinate or dataId dict with kwargs.

    Provides a single interface for adding the detector key (or others) to a
    dataId, whether it's a DataCoordinate or a dict.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The dataId to update.
    kwargs : `dict`
        The keys and values to add to the dataId.

    Returns
    -------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The updated dataId, with the same type as the input.
    """
    match dataId:
        case dafButler.DataCoordinate():
            return dafButler.DataCoordinate.standardize(dataId, **kwargs)
        case dict():
            return dict(dataId, **kwargs)
    raise ValueError(f"Unknown dataId type {type(dataId)}")
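

# An illustrative sketch of updateDataId (values are hypothetical); note
# that the return type mirrors the input type:
#
#     dataId = {'day_obs': 20220503, 'seq_num': 123}
#     dataId = updateDataId(dataId, detector=0)
#     # -> {'day_obs': 20220503, 'seq_num': 123, 'detector': 0}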


def sanitize_day_obs(day_obs):
    """Take a string or int day_obs and turn it into the int version.

    Parameters
    ----------
    day_obs : `str` or `int`
        The day_obs to sanitize.

    Returns
    -------
    day_obs : `int`
        The sanitized day_obs.

    Raises
    ------
    ValueError
        Raised if the day_obs fails to translate for any reason.
    """
    if isinstance(day_obs, int):
        return day_obs
    elif isinstance(day_obs, str):
        try:
            return int(day_obs.replace('-', ''))
        except Exception:
            raise ValueError(f'Failed to sanitize {day_obs!r} to a day_obs')
    else:
        raise ValueError(f'Cannot sanitize {day_obs!r} to a day_obs')
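

# For example (illustrative values):
#
#     sanitize_day_obs(20220503)      # -> 20220503
#     sanitize_day_obs('2022-05-03')  # -> 20220503
#     sanitize_day_obs(None)          # raises ValueError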


def getMostRecentDayObs(butler):
    """Get the most recent day_obs for which there is data.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    day_obs : `int`
        The day_obs.
    """
    where = "exposure.day_obs>=RECENT_DAY"
    records = butler.registry.queryDimensionRecords('exposure', where=where, datasets='raw',
                                                    bind={'RECENT_DAY': RECENT_DAY})
    recentDay = max(r.day_obs for r in records)
    _update_RECENT_DAY(recentDay)
    return recentDay


def getSeqNumsForDayObs(butler, day_obs, extraWhere=''):
    """Get a list of all seq_nums taken on a given day_obs.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    day_obs : `int` or `str`
        The day_obs for which the seq_nums are desired.
    extraWhere : `str`, optional
        Any extra where conditions to add to the queryDimensionRecords call.

    Returns
    -------
    seq_nums : `iterable`
        The seq_nums taken on the corresponding day_obs, in ascending
        numerical order.
    """
    day_obs = sanitize_day_obs(day_obs)
    where = "exposure.day_obs=day_obs"
    if extraWhere:
        extraWhere = extraWhere.replace('"', '\'')
        where += f" and {extraWhere}"
    records = butler.registry.queryDimensionRecords("exposure",
                                                    where=where,
                                                    bind={'day_obs': day_obs},
                                                    datasets='raw')
    return sorted(r.seq_num for r in records)
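

# A sketch of a constrained query (the day_obs and the extra condition are
# hypothetical):
#
#     seqNums = getSeqNumsForDayObs(butler, 20220503,
#                                   extraWhere="exposure.observation_type='science'")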


def sortRecordsByDayObsThenSeqNum(records):
    """Sort a set of records by dayObs, then seqNum, to give the order in
    which they were taken.

    Parameters
    ----------
    records : `iterable` of `lsst.daf.butler.DimensionRecord`
        The records to be sorted.

    Returns
    -------
    sortedRecords : `list` of `lsst.daf.butler.DimensionRecord`
        The sorted records.

    Raises
    ------
    ValueError
        Raised if the record set contains duplicate records, or if it
        contains (dayObs, seqNum) collisions.
    """
    records = list(records)  # must call list() in case we have a generator
    recordSet = set(records)
    if len(records) != len(recordSet):
        raise ValueError("Record set contains duplicate records and therefore cannot be sorted "
                         "unambiguously")

    daySeqTuples = [(r.day_obs, r.seq_num) for r in records]
    if len(daySeqTuples) != len(set(daySeqTuples)):
        raise ValueError("Record set contains dayObs/seqNum collisions, and therefore cannot be sorted "
                         "unambiguously")

    records.sort(key=lambda r: (r.day_obs, r.seq_num))
    return records


def getDaysWithData(butler, datasetType='raw'):
    """Get all the days for which LATISS has taken data on the mountain.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    datasetType : `str`, optional
        The datasetType to query.

    Returns
    -------
    days : `list` of `int`
        A sorted list of the day_obs values for which mountain-top data exists.
    """
    # 20200101 is a day between shipping LATISS and going on-sky.
    # We used to constrain on exposure.seq_num<50 to massively reduce the
    # number of returned records whilst being large enough to ensure that no
    # days are missed because early seq_nums were skipped. However, because
    # we have test datasets like LATISS-test-data-tts, where we only kept
    # seqNums from 950 on one day, we can no longer assume this, so don't be
    # tempted to add such a constraint back in here for speed.
    where = "exposure.day_obs>20200101"
    records = butler.registry.queryDimensionRecords("exposure", where=where, datasets=datasetType)
    return sorted(set(r.day_obs for r in records))


def getMostRecentDataId(butler):
    """Get the dataId for the most recent observation.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    dataId : `dict`
        The dataId of the most recent exposure.
    """
    lastDay = getMostRecentDayObs(butler)
    seqNum = getSeqNumsForDayObs(butler, lastDay)[-1]
    dataId = {'day_obs': lastDay, 'seq_num': seqNum, 'detector': 0}
    dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))
    return dataId
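

# The returned dataId is a plain dict, along the lines of (all values here
# are illustrative, including the exposure id format):
#
#     getMostRecentDataId(butler)
#     # -> {'day_obs': 20220503, 'seq_num': 123, 'detector': 0,
#     #     'exposure': 2022050300123}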


def getExpIdFromDayObsSeqNum(butler, dataId):
    """Get the exposure id for the dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    dataId : `dict`
        The dataId for which to return the exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing only the exposure id.
    """
    expRecord = getExpRecordFromDataId(butler, dataId)
    return {'exposure': expRecord.id}


def updateDataIdOrDataCord(dataId, **updateKwargs):
    """Add key/value pairs to a dataId or data coordinate.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate`
        The dataId to update.
    updateKwargs : `dict`
        The key/value pairs to add to the dataId or dataCoord.

    Returns
    -------
    dataId : `dict`
        The updated dataId.

    Notes
    -----
    This always returns a dict, so note that if a data coordinate is
    supplied, a dict is returned, changing the type.
    """
    newId = copy.copy(dataId)
    newId = _assureDict(newId)
    newId.update(updateKwargs)
    return newId


def fillDataId(butler, dataId):
    """Given a dataId, fill it with values for all available dimensions.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId to fill.

    Returns
    -------
    dataId : `dict`
        The filled dataId.

    Notes
    -----
    This function is *slow*! Running this on 20,000 dataIds takes
    approximately 7 minutes. Virtually all the slowdown is in the
    butler.registry.expandDataId() call though, so this wrapper is not to
    blame here, and might speed up in future with butler improvements.
    """
    # ensure it's a dict, to deal with records etc.
    dataId = _assureDict(dataId)

    # This removes extraneous keys that would trip up the registry call.
    # Using _rewrite_data_id is perhaps ever so slightly slower than popping
    # the bad keys, or making a minimal dataId by hand, but is more
    # reliable/general, so we choose that over the other approach here.
    dataId, _ = butler._rewrite_data_id(dataId, butler.registry.getDatasetType('raw'))

    # now expand, and turn back into a dict
    dataId = butler.registry.expandDataId(dataId, detector=0).full  # this call is VERY slow
    dataId = _assureDict(dataId)

    missingExpId = getExpId(dataId) is None
    missingDayObs = getDayObs(dataId) is None
    missingSeqNum = getSeqNum(dataId) is None

    if missingDayObs or missingSeqNum:
        dayObsSeqNum = getDayObsSeqNumFromExposureId(butler, dataId)
        dataId.update(dayObsSeqNum)

    if missingExpId:
        expId = getExpIdFromDayObsSeqNum(butler, dataId)
        dataId.update(expId)

    return dataId
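

# A hedged sketch of filling a minimal dataId (the output keys shown are
# indicative, not exhaustive):
#
#     dataId = {'day_obs': 20220503, 'seq_num': 123}
#     dataId = fillDataId(butler, dataId)
#     # dataId now also contains e.g. 'exposure', 'instrument' and 'detector'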


def _assureDict(dataId):
    """Turn any data-identifier-like object into a dict.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DataCoordinate` or
             `lsst.daf.butler.DimensionRecord`
        The data identifier.

    Returns
    -------
    dataId : `dict`
        The data identifier as a dict.
    """
    if isinstance(dataId, dict):
        return dataId
    elif hasattr(dataId, 'items'):  # dafButler.DataCoordinate
        return {str(k): v for k, v in dataId.items()}  # str() required due to full names
    elif hasattr(dataId, 'dataId'):  # dafButler.DimensionRecord
        return {str(k): v for k, v in dataId.dataId.items()}
    else:
        raise RuntimeError(f'Failed to coerce {type(dataId)} to dict')


def getExpRecordFromDataId(butler, dataId):
    """Get the exposure record for a given dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId.

    Returns
    -------
    expRecord : `lsst.daf.butler.DimensionRecord`
        The exposure record.
    """
    dataId = _assureDict(dataId)
    assert isinstance(dataId, dict), f'dataId must be a dict or DimensionRecord, got {type(dataId)}'

    if expId := getExpId(dataId):
        where = "exposure.id=expId"
        expRecords = butler.registry.queryDimensionRecords("exposure",
                                                           where=where,
                                                           bind={'expId': expId},
                                                           datasets='raw')
    else:
        dayObs = getDayObs(dataId)
        seqNum = getSeqNum(dataId)
        if not (dayObs and seqNum):
            raise RuntimeError(f'Failed to find either expId or day_obs and seq_num in dataId {dataId}')
        where = "exposure.day_obs=day_obs AND exposure.seq_num=seq_num"
        expRecords = butler.registry.queryDimensionRecords("exposure",
                                                           where=where,
                                                           bind={'day_obs': dayObs, 'seq_num': seqNum},
                                                           datasets='raw')

    expRecords = set(expRecords)
    if not expRecords:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(expRecords) == 1, f'Found {len(expRecords)} exposure records for {dataId}'
    return expRecords.pop()


def getDayObsSeqNumFromExposureId(butler, dataId):
    """Get the day_obs and seq_num for an exposure id.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict` or `int`
        The dataId containing the exposure id, or the bare exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing only the day_obs and seq_num.
    """
    if isinstance(dataId, int):  # wrap a bare exposure id before any key lookups
        dataId = {'exposure': dataId}

    if (dayObs := getDayObs(dataId)) and (seqNum := getSeqNum(dataId)):
        return {'day_obs': dayObs, 'seq_num': seqNum}

    dataId = _assureDict(dataId)
    assert isinstance(dataId, dict)

    if not (expId := getExpId(dataId)):
        raise RuntimeError(f'Failed to find exposure id in {dataId}')

    where = "exposure.id=expId"
    expRecords = butler.registry.queryDimensionRecords("exposure",
                                                       where=where,
                                                       bind={'expId': expId},
                                                       datasets='raw')
    expRecords = set(expRecords)
    if not expRecords:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(expRecords) == 1, f'Found {len(expRecords)} exposure records for {dataId}'
    record = expRecords.pop()
    return {'day_obs': record.day_obs, 'seq_num': record.seq_num}


def getDatasetRefForDataId(butler, datasetType, dataId):
    """Get the dataset reference for a dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `lsst.daf.butler.DatasetType`
        The dataset type.
    dataId : `dict`
        The dataId.

    Returns
    -------
    datasetRef : `lsst.daf.butler.DatasetRef`
        The dataset reference.
    """
    if not _expid_present(dataId):
        assert _dayobs_present(dataId) and _seqnum_present(dataId)
        dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))

    dRef = butler.registry.findDataset(datasetType, dataId)
    return dRef
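

# A minimal sketch ('quickLookExp' and the dataId values are hypothetical,
# used purely as an example dataset type and identifier):
#
#     dRef = getDatasetRefForDataId(butler, 'quickLookExp',
#                                   {'day_obs': 20220503, 'seq_num': 123,
#                                    'detector': 0})
#     exp = butler.get(dRef)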


def removeDataProduct(butler, datasetType, dataId):
    """Remove a data product from the registry. Use with caution.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `lsst.daf.butler.DatasetType`
        The dataset type.
    dataId : `dict`
        The dataId.
    """
    if datasetType == 'raw':
        raise RuntimeError("I'm sorry, Dave, I'm afraid I can't do that.")
    dRef = getDatasetRefForDataId(butler, datasetType, dataId)
    butler.pruneDatasets([dRef], disassociate=True, unstore=True, purge=True)
    return


def _dayobs_present(dataId):
    return _get_dayobs_key(dataId) is not None


def _seqnum_present(dataId):
    return _get_seqnum_key(dataId) is not None


def _expid_present(dataId):
    return _get_expid_key(dataId) is not None


def _get_dayobs_key(dataId):
    """Return the key for day_obs if present, else None."""
    keys = [k for k in dataId.keys() if 'day_obs' in k]
    if not keys:
        return None
    return keys[0]


def _get_seqnum_key(dataId):
    """Return the key for seq_num if present, else None."""
    keys = [k for k in dataId.keys() if 'seq_num' in k]
    if not keys:
        return None
    return keys[0]


def _get_expid_key(dataId):
    """Return the key for the exposure id if present, else None."""
    if 'exposure.id' in dataId:
        return 'exposure.id'
    elif 'exposure' in dataId:
        return 'exposure'
    return None


def getDayObs(dataId):
    """Get the day_obs from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    day_obs : `int` or `None`
        The day_obs value if present, else None.
    """
    if hasattr(dataId, 'day_obs'):
        return dataId.day_obs
    if not _dayobs_present(dataId):
        return None
    return dataId['day_obs'] if 'day_obs' in dataId else dataId['exposure.day_obs']


def getSeqNum(dataId):
    """Get the seq_num from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    seq_num : `int` or `None`
        The seq_num value if present, else None.
    """
    if hasattr(dataId, 'seq_num'):
        return dataId.seq_num
    if not _seqnum_present(dataId):
        return None
    return dataId['seq_num'] if 'seq_num' in dataId else dataId['exposure.seq_num']


def getExpId(dataId):
    """Get the expId from a dataId.

    Parameters
    ----------
    dataId : `dict` or `lsst.daf.butler.DimensionRecord`
        The dataId.

    Returns
    -------
    expId : `int` or `None`
        The expId value if present, else None.
    """
    if hasattr(dataId, 'id'):
        return dataId.id
    if not _expid_present(dataId):
        return None
    return dataId['exposure'] if 'exposure' in dataId else dataId['exposure.id']
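

# These accessors tolerate both key-name variants, and return None when the
# value is absent (illustrative values):
#
#     getDayObs({'day_obs': 20220503})           # -> 20220503
#     getDayObs({'exposure.day_obs': 20220503})  # -> 20220503
#     getSeqNum({'detector': 0})                 # -> None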


def getLatissOnSkyDataIds(butler, skipTypes=('bias', 'dark', 'flat'), checkObject=True, full=True,
                          startDate=None, endDate=None):
    """Get a list of all on-sky dataIds taken.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    skipTypes : `list` of `str`, optional
        Image types to exclude.
    checkObject : `bool`, optional
        Check whether the value of target_name (formerly OBJECT) is set, and
        exclude the exposure if it is not.
    full : `bool`, optional
        Return filled dataIds. Required for some analyses, but runs much
        (~30x) slower.
    startDate : `int`, optional
        The day_obs to start at, inclusive.
    endDate : `int`, optional
        The day_obs to end at, inclusive.

    Returns
    -------
    dataIds : `list` of `dict`
        The dataIds.
    """
    def isOnSky(expRecord):
        imageType = expRecord.observation_type
        obj = expRecord.target_name
        if checkObject and obj == 'NOTSET':
            return False
        if imageType not in skipTypes:
            return True
        return False

    recordSets = []
    days = getDaysWithData(butler)
    if startDate:
        days = [d for d in days if d >= startDate]
    if endDate:
        days = [d for d in days if d <= endDate]
    days = sorted(set(days))

    where = "exposure.day_obs=day_obs"
    for day in days:
        # queryDataIds would be better here, but it's then hard/impossible
        # to do the filtering for which is on sky, so just take the dataIds
        records = butler.registry.queryDimensionRecords("exposure",
                                                        where=where,
                                                        bind={'day_obs': day},
                                                        datasets='raw')
        recordSets.append(sortRecordsByDayObsThenSeqNum(records))

    dataIds = [r.dataId for r in filter(isOnSky, itertools.chain(*recordSets))]
    if full:
        expandedIds = [updateDataIdOrDataCord(butler.registry.expandDataId(dataId, detector=0).full)
                       for dataId in dataIds]
        filledIds = [fillDataId(butler, dataId) for dataId in expandedIds]
        return filledIds
    else:
        return [updateDataIdOrDataCord(dataId, detector=0) for dataId in dataIds]
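

# A hypothetical end-to-end sketch: collect the unfilled on-sky dataIds for
# a date range (the dates are illustrative):
#
#     butler = makeDefaultLatissButler()
#     dataIds = getLatissOnSkyDataIds(butler, startDate=20220301,
#                                     endDate=20220503, full=False)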