Coverage for python/lsst/pipe/base/executionButlerBuilder.py : 14%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler", )

import io

from collections import defaultdict
from typing import Callable, DefaultDict, Mapping, Optional, Set, Tuple, Iterable, List, Union

from lsst.daf.butler import (DatasetRef, DatasetType, Butler, DataCoordinate, ButlerURI, Config)
from lsst.daf.butler.core.utils import getClassOf
from lsst.daf.butler.transfers import RepoExportContext
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG

from . import QuantumGraph, QuantumNode

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _accumulate(graph: QuantumGraph) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the dataIds that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references: if a reference has an id
                # it already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.isComponent():
                        # We cannot insert a component, and a component will
                        # be part of some other upstream dataset, so it
                        # should be safe to skip them here.
                        continue

                    if ref.id is not None:
                        exports.add(ref)
                    else:
                        inserts[type].add(ref.dataId)
    return exports, inserts
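
# A minimal sketch of how the two return values of _accumulate might be
# inspected, assuming ``graph`` is a previously loaded QuantumGraph (the
# variable names here are hypothetical and this is not done by this module):
#
#     exports, inserts = _accumulate(graph)
#     print(f"{len(exports)} existing datasets will be exported")
#     for dsType, dataIds in inserts.items():
#         print(f"{dsType.name}: {len(dataIds)} datasets expected to be produced")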


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> Set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be removable in the future if the collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(butler.registry.queryCollections(collections, flattenChains=True,
                                                                     includeChains=True))
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
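
# A rough illustration of the fixed point the loop above converges to. If a
# hypothetical CHAINED collection "HSC/defaults" points at two child run
# collections, the call below returns the chain together with its children:
#
#     _discoverCollections(butler, ["HSC/defaults"])
#     # e.g. {"HSC/defaults", "HSC/runs/a", "HSC/runs/b"}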


def _export(butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef],
            inserts: DataSetTypeMap) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf butler objects; however, it reaches in deep and does not use the
    # public methods so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = getClassOf(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly defined collections; if none were given, fall back
    # to the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
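
# A minimal sketch of how the returned buffer might be dumped to disk for
# inspection while debugging; the file name is hypothetical and this is not
# done anywhere in this module:
#
#     yamlBuffer = _export(butler, None, exports, inserts)
#     with open("execution_export.yaml", "w") as f:
#         f.write(yamlBuffer.getvalue())
#     yamlBuffer.seek(0)  # rewind again before handing the buffer to _import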


def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far
        # and this file exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(root=outputLocation, config=config,
                             dimensionConfig=butler.registry.dimensions.dimensionConfig,
                             overwrite=True, forceConfigRoot=False)

    # Return a newly created butler.
    return Butler(config, writeable=True)
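
# The net effect of the configuration overrides above, expressed as the keys
# one could read back from the new repository (values shown are illustrative
# and the check itself is hypothetical, not performed by this module):
#
#     cfg = Butler(outputLocation.geturl())._config
#     cfg["registry", "db"]                   # "sqlite:///<butlerRoot>/gen3.sqlite3"
#     cfg["datastore", "trust_get_request"]   # True
#     cfg["allow_put_of_predefined_dataset"]  # True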


def _import(yamlBuffer: io.StringIO,
            newButler: Butler,
            inserts: DataSetTypeMap,
            run: str,
            butlerModifier: Optional[Callable[[Butler], Butler]]
            ) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True)

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
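
# A minimal sketch of a butlerModifier callable as consumed above; the run
# collection name is a hypothetical placeholder:
#
#     def addExtraRun(butler: Butler) -> Butler:
#         butler.registry.registerRun("u/example/extra-run")
#         return butler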


def buildExecutionButler(butler: Butler,
                         graph: QuantumGraph,
                         outputLocation: Union[str, ButlerURI],
                         run: str,
                         *,
                         clobber: bool = False,
                         butlerModifier: Optional[Callable[[Butler], Butler]] = None,
                         collections: Optional[Iterable[str]] = None
                         ) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` that contains only the datasets specified by
    the `QuantumGraph`. These datasets are both those that already exist in
    the input `~lsst.daf.butler.Butler` and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` that was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        desired modifications to the `~lsst.daf.butler.Butler`. It
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ButlerURI(outputLocation, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(graph)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    return _import(yamlBuffer, newButler, inserts, run, butlerModifier)
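
# A minimal usage sketch, assuming a pre-existing data repository and a saved
# QuantumGraph file; the repository path, graph file, output location, and
# run name below are hypothetical placeholders:
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import QuantumGraph, buildExecutionButler
#
#     butler = Butler("/repo/main")
#     graph = QuantumGraph.loadUri("pipeline.qgraph", butler.registry.dimensions)
#     execButler = buildExecutionButler(
#         butler, graph, "/scratch/exec_butler", run="u/example/run", clobber=True
#     )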