Coverage for python/lsst/summit/utils/butlerUtils.py: 13%
227 statements
« prev ^ index » next coverage.py v6.5.0, created at 2022-10-18 02:50 -0700
« prev ^ index » next coverage.py v6.5.0, created at 2022-10-18 02:50 -0700
1# This file is part of summit_utils.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
22import lsst.daf.butler as dafButler
23import itertools
24import copy
26from lsst.summit.utils.utils import getSite
# Public API of this module, exported via ``from ... import *``.
__all__ = ["makeDefaultLatissButler",
           "sanitize_day_obs",
           "getMostRecentDayObs",
           "getSeqNumsForDayObs",
           "getMostRecentDataId",
           "getDatasetRefForDataId",
           "getDayObs",
           "getSeqNum",
           "getExpId",
           "datasetExists",
           "sortRecordsByDayObsThenSeqNum",
           "getDaysWithData",
           "getExpIdFromDayObsSeqNum",
           "updateDataIdOrDataCord",
           "fillDataId",
           "getExpRecordFromDataId",
           "getDayObsSeqNumFromExposureId",
           "removeDataProduct",
           "getLatissOnSkyDataIds",
           ]
# Default input collections for LATISS butlers; extended per-site by
# getLatissDefaultCollections().
_LATISS_DEFAULT_COLLECTIONS = ['LATISS/raw/all', 'LATISS/calib', "LATISS/runs/quickLook"]

# RECENT_DAY must be in the past *and have data* (otherwise some tests are
# no-ops), to speed up queries by restricting them significantly,
# but data must definitely have been taken since. Should
# also not be more than 2 months in the past due to 60 day lookback time on the
# summit. All this means it should be updated by an informed human.
RECENT_DAY = 20220503
def _configureForSite():
    """Adjust module defaults for the site at which the code is running.

    Runs automatically on module import. On the Tucson test stand (TTS),
    which only holds a limited dataset, RECENT_DAY is moved back to a day
    known to have data.
    """
    try:
        site = getSite()
    except ValueError:
        # this method is run automatically on module import, so
        # don't fail for k8s where this cannot yet be determined
        print("WARNING: failed to automatically determine site")
        site = None

    if site == 'tucson':
        global RECENT_DAY
        RECENT_DAY = 20211104  # TTS has limited data, so use this day


_configureForSite()
def getLatissDefaultCollections():
    """Get the default set of LATISS collections, updated for the site at
    which the code is being run.

    Returns
    -------
    collections : `list` of `str`
        The default collections for the site.
    """
    # Copy the module-level list: the previous code aliased it, so the
    # site-specific append() below grew _LATISS_DEFAULT_COLLECTIONS on
    # every call, and callers mutating the return value corrupted it too.
    collections = list(_LATISS_DEFAULT_COLLECTIONS)
    try:
        site = getSite()
    except ValueError:
        site = ''

    if site == 'tucson':
        collections.append("LATISS-test-data-tts")
    elif site == 'summit':
        collections.append("LATISS_test_data")
    return collections
def _update_RECENT_DAY(day):
    """Move RECENT_DAY forward when a more recent day with data is seen.

    The module-level RECENT_DAY is only ever increased, never decreased, so
    query restrictions stay as tight as possible.
    """
    global RECENT_DAY
    RECENT_DAY = max(RECENT_DAY, day - 1)
def makeDefaultLatissButler(*, extraCollections=None, writeable=False, oga=False):
    """Create a butler for LATISS using the default collections.

    Parameters
    ----------
    extraCollections : `list` of `str`, optional
        Extra input collections to supply to the butler init.
    writeable : `bool`, optional
        Whether to make a writeable butler.
    oga : `bool`, optional
        Use the OGA repo instead of the main one. Needed to access embargoed
        data.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        The butler.

    Raises
    ------
    FileNotFoundError
        Raised if the butler could not be instantiated for the repo.
    """
    # TODO: Add logging to which collections are going in
    collections = getLatissDefaultCollections()
    if extraCollections:
        collections.extend(extraCollections)
    repoString = "/repo/oga" if oga else "LATISS"
    try:
        butler = dafButler.Butler(repoString,
                                  collections=collections,
                                  writeable=writeable,
                                  instrument='LATISS')
    except (FileNotFoundError, RuntimeError) as e:
        # Depending on the value of DAF_BUTLER_REPOSITORY_INDEX and whether
        # it is present and blank, or just not set, both these exception
        # types can be raised, see tests/test_butlerUtils.py:ButlerInitTestCase
        # for details and tests which confirm these have not changed.
        # Unify the exception type, but keep the original as the cause and
        # include the repo so the failure is actionable (the old bare
        # ``raise FileNotFoundError`` discarded all context).
        raise FileNotFoundError(f"Failed to instantiate butler for repo {repoString!r}") from e
    return butler
# TODO: DM-32940 can remove this whole function once this ticket merges.
def datasetExists(butler, dataProduct, dataId, **kwargs):
    """Collapse the tri-state behaviour of butler.datasetExists to a boolean.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataProduct : `str`
        The type of data product to check for.
    dataId : `dict`
        The dataId of the dataProduct to check for.

    Returns
    -------
    exists : `bool`
        True if the dataProduct exists for the dataId and can be retrieved,
        else False.
    """
    try:
        # return directly; the intermediate variable added nothing
        return butler.datasetExists(dataProduct, dataId, **kwargs)
    except (LookupError, RuntimeError):
        # these are the two ways datasetExists signals "not there"
        return False
def sanitize_day_obs(day_obs):
    """Take string or int day_obs and turn it into the int version.

    Parameters
    ----------
    day_obs : `str` or `int`
        The day_obs to sanitize, e.g. 20220503 or '2022-05-03'.

    Returns
    -------
    day_obs : `int`
        The sanitized day_obs.

    Raises
    ------
    ValueError
        Raised if the day_obs fails to translate for any reason.
    """
    if isinstance(day_obs, int):
        return day_obs
    elif isinstance(day_obs, str):
        try:
            return int(day_obs.replace('-', ''))
        except Exception as e:
            # the ValueError here was previously constructed but never
            # raised, so malformed strings silently returned None
            raise ValueError(f'Failed to sanitize {day_obs!r} to a day_obs') from e
    else:
        raise ValueError(f'Cannot sanitize {day_obs!r} to a day_obs')
def getMostRecentDayObs(butler):
    """Get the most recent day_obs for which there is data.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    day_obs : `int`
        The day_obs.
    """
    # restrict to days >= RECENT_DAY to keep the registry query fast
    where = "exposure.day_obs>=RECENT_DAY"
    records = butler.registry.queryDimensionRecords('exposure', where=where, datasets='raw',
                                                    bind={'RECENT_DAY': RECENT_DAY})
    recentDay = max(r.day_obs for r in records)
    # we just learned about a (possibly) newer day with data, so tighten
    # the module-level restriction for future queries
    _update_RECENT_DAY(recentDay)
    return recentDay
def getSeqNumsForDayObs(butler, day_obs, extraWhere=''):
    """Get a list of all seq_nums taken on a given day_obs.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    day_obs : `int` or `str`
        The day_obs for which the seq_nums are desired.
    extraWhere : `str`
        Any extra where conditions to add to the queryDimensionRecords call.

    Returns
    -------
    seq_nums : `iterable`
        The seq_nums taken on the corresponding day_obs, in ascending
        numerical order.
    """
    dayObsInt = sanitize_day_obs(day_obs)
    whereClause = "exposure.day_obs=day_obs"
    if extraWhere:
        # the registry query language only accepts single quotes
        sanitizedExtra = extraWhere.replace('"', "'")
        whereClause += f" and {sanitizedExtra}"
    exposureRecords = butler.registry.queryDimensionRecords("exposure",
                                                            where=whereClause,
                                                            bind={'day_obs': dayObsInt},
                                                            datasets='raw')
    return sorted(record.seq_num for record in exposureRecords)
def sortRecordsByDayObsThenSeqNum(records):
    """Sort a set of records into the order in which they were taken, i.e.
    by dayObs and then seqNum.

    Parameters
    ----------
    records : `list` of `dict`
        The records to be sorted.

    Returns
    -------
    sortedRecords : `list` of `dict`
        The sorted records.

    Raises
    ------
    ValueError
        Raised if the recordSet contains duplicate records, or if it contains
        (dayObs, seqNum) collisions.
    """
    recordList = list(records)  # materialize in case we were handed a generator
    if len(set(recordList)) != len(recordList):
        raise ValueError("Record set contains duplicate records and therefore cannot be sorted unambiguously")

    sortKeys = [(record.day_obs, record.seq_num) for record in recordList]
    if len(set(sortKeys)) != len(sortKeys):
        raise ValueError("Record set contains dayObs/seqNum collisions, and therefore cannot be sorted "
                         "unambiguously")

    return sorted(recordList, key=lambda record: (record.day_obs, record.seq_num))
def getDaysWithData(butler):
    """Get all the days for which LATISS has taken data on the mountain.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    days : `list` of `int`
        A sorted list of the day_obs values for which mountain-top data exists.
    """
    # 20200101 is a day between shipping LATISS and going on sky.
    # We used to constrain on exposure.seq_num<50 to massively reduce the
    # number of returned records whilst being large enough to ensure that no
    # days are missed because early seq_nums were skipped. However, because
    # we have test datasets like LATISS-test-data-tts where we only kept
    # seqNums from 950 on one day, we can no longer assume this so don't be
    # tempted to add such a constraint back in here for speed.
    exposureRecords = butler.registry.queryDimensionRecords("exposure",
                                                            where="exposure.day_obs>20200101",
                                                            datasets='raw')
    return sorted({record.day_obs for record in exposureRecords})
def getMostRecentDataId(butler):
    """Get the dataId for the most recent observation.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.

    Returns
    -------
    dataId : `dict`
        The dataId of the most recent exposure.
    """
    mostRecentDay = getMostRecentDayObs(butler)
    # getSeqNumsForDayObs returns ascending order, so max == last taken
    lastSeqNum = max(getSeqNumsForDayObs(butler, mostRecentDay))
    dataId = {'day_obs': mostRecentDay, 'seq_num': lastSeqNum, 'detector': 0}
    dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))
    return dataId
def getExpIdFromDayObsSeqNum(butler, dataId):
    """Get the exposure id for the dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler to query.
    dataId : `dict`
        The dataId for which to return the exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing only the exposure id, keyed by 'exposure'.
    """
    record = getExpRecordFromDataId(butler, dataId)
    return {'exposure': record.id}
def updateDataIdOrDataCord(dataId, **updateKwargs):
    """Add key, value pairs to a dataId or data coordinate.

    Parameters
    ----------
    dataId : `dict`
        The dataId to update.
    updateKwargs : `dict`
        The key value pairs to add to the dataId or dataCoord.

    Returns
    -------
    dataId : `dict`
        The updated dataId.

    Notes
    -----
    Always returns a dict, so note that if a data coordinate is supplied, a
    dict is returned, changing the type.
    """
    # copy first so the caller's dataId is never mutated
    updated = _assureDict(copy.copy(dataId))
    updated.update(updateKwargs)
    return updated
def fillDataId(butler, dataId):
    """Given a dataId, fill it with values for all available dimensions.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId to fill.

    Returns
    -------
    dataId : `dict`
        The filled dataId.

    Notes
    -----
    This function is *slow*! Running this on 20,000 dataIds takes approximately
    7 minutes. Virtually all the slowdown is in the
    butler.registry.expandDataId() call though, so this wrapper is not to blame
    here, and might speed up in future with butler improvements.
    """
    # ensure it's a dict to deal with records etc
    dataId = _assureDict(dataId)

    # this removes extraneous keys that would trip up the registry call
    # using _rewrite_data_id is perhaps ever so slightly slower than popping
    # the bad keys, or making a minimal dataId by hand, but is more
    # reliable/general, so we choose that over the other approach here
    # NOTE(review): _rewrite_data_id is a private Butler API — confirm it
    # still exists when upgrading daf_butler
    dataId, _ = butler._rewrite_data_id(dataId, butler.registry.getDatasetType('raw'))

    # now expand and turn back to a dict
    dataId = butler.registry.expandDataId(dataId, detector=0).full  # this call is VERY slow
    dataId = _assureDict(dataId)

    # record what was missing *before* updating, since each lookup below
    # may rely on the keys filled in by the previous step
    missingExpId = getExpId(dataId) is None
    missingDayObs = getDayObs(dataId) is None
    missingSeqNum = getSeqNum(dataId) is None

    if missingDayObs or missingSeqNum:
        dayObsSeqNum = getDayObsSeqNumFromExposureId(butler, dataId)
        dataId.update(dayObsSeqNum)

    if missingExpId:
        expId = getExpIdFromDayObsSeqNum(butler, dataId)
        dataId.update(expId)

    return dataId
424def _assureDict(dataId):
425 """Turn any data-identifier-like object into a dict.
427 Parameters
428 ----------
429 dataId : `dict` or `lsst.daf.butler.dimensions.DataCoordinate` or
430 `lsst.daf.butler.dimensions.DimensionRecord`
431 The data identifier.
433 Returns
434 -------
435 dataId : `dict`
436 The data identifier as a dict.
437 """
438 if isinstance(dataId, dict):
439 return dataId
440 elif hasattr(dataId, 'items'): # dafButler.dimensions.DataCoordinate
441 return {str(k): v for k, v in dataId.items()} # str() required due to full names
442 elif hasattr(dataId, 'dataId'): # dafButler.dimensions.DimensionRecord
443 return {str(k): v for k, v in dataId.dataId.items()}
444 else:
445 raise RuntimeError(f'Failed to coerce {type(dataId)} to dict')
def getExpRecordFromDataId(butler, dataId):
    """Get the exposure record for a given dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict`
        The dataId.

    Returns
    -------
    expRecord : `lsst.daf.butler.dimensions.ExposureRecord`
        The exposure record.

    Raises
    ------
    RuntimeError
        Raised if the dataId contains neither an exposure id nor a
        day_obs/seq_num pair.
    LookupError
        Raised if no exposure record matches the dataId.
    """
    dataId = _assureDict(dataId)
    assert isinstance(dataId, dict), f'dataId must be a dict or DimensionRecord, got {type(dataId)}'

    expId = getExpId(dataId)
    if expId:
        # query by exposure id when we have one
        results = butler.registry.queryDimensionRecords("exposure",
                                                        where="exposure.id=expId",
                                                        bind={'expId': expId},
                                                        datasets='raw')
    else:
        # fall back to querying by (day_obs, seq_num)
        dayObs = getDayObs(dataId)
        seqNum = getSeqNum(dataId)
        if not (dayObs and seqNum):
            raise RuntimeError(f'Failed to find either expId or day_obs and seq_num in dataId {dataId}')
        results = butler.registry.queryDimensionRecords("exposure",
                                                        where="exposure.day_obs=day_obs AND exposure.seq_num=seq_num",
                                                        bind={'day_obs': dayObs, 'seq_num': seqNum},
                                                        datasets='raw')

    uniqueRecords = set(results)
    if not uniqueRecords:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(uniqueRecords) == 1, f'Found {len(uniqueRecords)} exposure records for {dataId}'
    return uniqueRecords.pop()
def getDayObsSeqNumFromExposureId(butler, dataId):
    """Get the day_obs and seq_num for an exposure id.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    dataId : `dict` or `int`
        The dataId containing the exposure id, or the bare exposure id.

    Returns
    -------
    dataId : `dict`
        A dict containing only the day_obs and seq_num.
    """
    # fast path: nothing to look up if both values are already present
    if (dayObs := getDayObs(dataId)) and (seqNum := getSeqNum(dataId)):
        return {'day_obs': dayObs, 'seq_num': seqNum}

    dataId = {'exposure': dataId} if isinstance(dataId, int) else _assureDict(dataId)
    assert isinstance(dataId, dict)

    expId = getExpId(dataId)
    if not expId:
        raise RuntimeError(f'Failed to find exposure id in {dataId}')

    matches = set(butler.registry.queryDimensionRecords("exposure",
                                                        where="exposure.id=expId",
                                                        bind={'expId': expId},
                                                        datasets='raw'))
    if not matches:
        raise LookupError(f"No exposure records found for {dataId}")
    assert len(matches) == 1, f'Found {len(matches)} exposure records for {dataId}'
    record = matches.pop()
    return {'day_obs': record.day_obs, 'seq_num': record.seq_num}
def getDatasetRefForDataId(butler, datasetType, dataId):
    """Get the datasetReference for a dataId.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `datasetType`
        The dataset type.
    dataId : `dict`
        The dataId. If the exposure id is missing it is looked up from
        day_obs/seq_num and the dataId is updated in place.

    Returns
    -------
    datasetRef : `lsst.daf.butler.dimensions.DatasetReference`
        The dataset reference.

    Raises
    ------
    RuntimeError
        Raised if the dataId contains neither an exposure id nor both a
        day_obs and a seq_num.
    """
    if not _expid_present(dataId):
        # explicit raise rather than assert so the check survives python -O
        if not (_dayobs_present(dataId) and _seqnum_present(dataId)):
            raise RuntimeError('dataId must contain either an exposure id, or both a day_obs and '
                               f'a seq_num, got {dataId}')
        dataId.update(getExpIdFromDayObsSeqNum(butler, dataId))

    dRef = butler.registry.findDataset(datasetType, dataId)
    return dRef
def removeDataProduct(butler, datasetType, dataId):
    """Remove a data product from the registry. Use with caution.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    datasetType : `str` or `datasetType`
        The dataset type.
    dataId : `dict`
        The dataId.

    Raises
    ------
    RuntimeError
        Raised if an attempt is made to remove a raw.
    """
    # raws are sacrosanct - refuse to delete them under any circumstances
    if datasetType == 'raw':
        raise RuntimeError("I'm sorry, Dave, I'm afraid I can't do that.")
    datasetRef = getDatasetRefForDataId(butler, datasetType, dataId)
    butler.pruneDatasets([datasetRef], disassociate=True, unstore=True, purge=True)
def _dayobs_present(dataId):
    """Return True if any day_obs-like key is present in the dataId."""
    return _get_dayobs_key(dataId) is not None


def _seqnum_present(dataId):
    """Return True if any seq_num-like key is present in the dataId."""
    return _get_seqnum_key(dataId) is not None


def _expid_present(dataId):
    """Return True if an exposure id key is present in the dataId."""
    return _get_expid_key(dataId) is not None
588def _get_dayobs_key(dataId):
589 """Return the key for day_obs if present, else None
590 """
591 keys = [k for k in dataId.keys() if k.find('day_obs') != -1]
592 if not keys:
593 return None
594 return keys[0]
597def _get_seqnum_key(dataId):
598 """Return the key for seq_num if present, else None
599 """
600 keys = [k for k in dataId.keys() if k.find('seq_num') != -1]
601 if not keys:
602 return None
603 return keys[0]
606def _get_expid_key(dataId):
607 """Return the key for expId if present, else None
608 """
609 if 'exposure.id' in dataId:
610 return 'exposure.id'
611 elif 'exposure' in dataId:
612 return 'exposure'
613 return None
def getDayObs(dataId):
    """Get the day_obs from a dataId.

    Parameters
    ----------
    dataId : `dict`
        The dataId.

    Returns
    -------
    day_obs : `int` or `None`
        The day_obs value if present, else None.
    """
    if not _dayobs_present(dataId):
        return None
    key = 'day_obs' if 'day_obs' in dataId else 'exposure.day_obs'
    return dataId[key]
def getSeqNum(dataId):
    """Get the seq_num from a dataId.

    Parameters
    ----------
    dataId : `dict`
        The dataId.

    Returns
    -------
    seq_num : `int` or `None`
        The seq_num value if present, else None.
    """
    if not _seqnum_present(dataId):
        return None
    key = 'seq_num' if 'seq_num' in dataId else 'exposure.seq_num'
    return dataId[key]
def getExpId(dataId):
    """Get the expId from a dataId.

    Parameters
    ----------
    dataId : `dict`
        The dataId.

    Returns
    -------
    expId : `int` or `None`
        The expId value if present, else None.
    """
    if not _expid_present(dataId):
        return None
    key = 'exposure' if 'exposure' in dataId else 'exposure.id'
    return dataId[key]
def getLatissOnSkyDataIds(butler, skipTypes=('bias', 'dark', 'flat'), checkObject=True, full=True,
                          startDate=None, endDate=None):
    """Get a list of all on-sky dataIds taken.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The butler.
    skipTypes : `list` of `str`
        Image types to exclude.
    checkObject : `bool`
        Check if the value of target_name (formerly OBJECT) is set and
        exclude if it is not.
    full : `bool`
        Return filled dataIds. Required for some analyses, but runs much
        (~30x) slower.
    startDate : `int`
        The day_obs to start at, inclusive.
    endDate : `int`
        The day_obs to end at, inclusive.

    Returns
    -------
    dataIds : `list` of `dataId`
        The dataIds.
    """
    def isOnSky(expRecord):
        # on-sky here means: not one of the excluded (calib) image types,
        # and, if checkObject is set, has a target name set
        imageType = expRecord.observation_type
        obj = expRecord.target_name
        if checkObject and obj == 'NOTSET':
            return False
        if imageType not in skipTypes:
            return True
        return False

    recordSets = []
    days = getDaysWithData(butler)
    if startDate:
        days = [d for d in days if d >= startDate]
    if endDate:
        days = [d for d in days if d <= endDate]
    days = sorted(set(days))

    where = "exposure.day_obs=day_obs"
    for day in days:
        # queryDataIds would be better here, but it's then hard/impossible
        # to do the filtering for which is on sky, so just take the dataIds
        records = butler.registry.queryDimensionRecords("exposure",
                                                        where=where,
                                                        bind={'day_obs': day},
                                                        datasets='raw')
        recordSets.append(sortRecordsByDayObsThenSeqNum(records))

    dataIds = [r.dataId for r in filter(isOnSky, itertools.chain(*recordSets))]
    if full:
        # expanding each dataId is what makes full=True ~30x slower
        expandedIds = [updateDataIdOrDataCord(butler.registry.expandDataId(dataId, detector=0).full)
                       for dataId in dataIds]
        filledIds = [fillDataId(butler, dataId) for dataId in expandedIds]
        return filledIds
    else:
        return [updateDataIdOrDataCord(dataId, detector=0) for dataId in dataIds]