# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, ButlerURI, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.transfers import RepoExportContext
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes
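
# Mapping of each DatasetType to the set of data IDs expected for datasets
# of that type.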
DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.
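
    # For example, a raw input that already exists in the input registry will
    # end up in ``exports`` (its ref is resolved, i.e. has an id), while a
    # not-yet-produced output ends up in ``inserts``, keyed by its
    # DatasetType; the actual dataset types depend on the pipeline.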
    # ``exports`` holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # ``inserts`` is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references; if one has an id, it
                # already exists and should be exported, otherwise it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
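    # For example, given a CHAINED collection whose children are two RUN
    # collections, all three collection names end up in the returned set.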
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Look for any defined collections; if there are none, use the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry file; if the code got this far and it
        # exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
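    # ``<butlerRoot>`` is the BUTLER_ROOT_TAG placeholder, expanded to the
    # repository root when the config is loaded, so the new SQLite registry
    # lives alongside the execution butler.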

    # Remove any namespace that may be set in main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: str,
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: Union[str, ButlerURI],
    run: str,
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. It
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections/runs etc.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and clobber is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ButlerURI(outputLocation, forceDirectory=True)

    # Do this first to fail fast if the output already exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because they might want to register dataset
    # types later. It would be nice to also check that, but to do that we
    # would need to be told whether they plan to register dataset types later
    # (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    return _import(yamlBuffer, newButler, inserts, run, butlerModifier)
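

# Usage sketch: how buildExecutionButler might be invoked. The repository
# path, graph file, and run name below are hypothetical, and the exact
# QuantumGraph.loadUri signature may vary between versions:
#
#     from lsst.daf.butler import Butler
#     from lsst.pipe.base import QuantumGraph
#     from lsst.pipe.base.executionButlerBuilder import buildExecutionButler
#
#     butler = Butler("/repo/main")
#     qgraph = QuantumGraph.loadUri("pipeline.qgraph", butler.registry.dimensions)
#     executionButler = buildExecutionButler(
#         butler,
#         qgraph,
#         outputLocation="/scratch/execution_butler",
#         run="u/user/demo",
#         clobber=True,
#     )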