Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 15%
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes
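
# Alias for the mapping from each DatasetType to the set of data IDs of the
# datasets that are expected to be produced by the QuantumGraph and must
# therefore be pre-registered in the execution butler.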
DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _accumulate(
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions, and
    # they are by definition not resolved. initInputs are part of Quantum and
    # that's the only place the graph stores the dataset IDs, so we process
    # them there even though each Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for type, refs in attr.items():
                # This if block is needed because initInputs yields a single
                # DatasetRef per item rather than a list.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references; if one has an id it
                # already exists and should be exported, otherwise it must
                # be inserted into the new registry.
                for ref in refs:
                    if ref.id is not None:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because there have been cases where some
    # collections were not properly discovered and exported. This method may
    # become removable in the future if the collection export logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # YAML is hard-coded, since this code controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for inputs are transferred.
    # Butler.transfer_from() does not (yet) transfer records.
    dataIds = set(ref.dataId for ref in exports)
    exporter.saveDataIds(dataIds)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use any explicitly supplied collections; if there are none, fall back
    # to the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(butler: Butler, outputLocation: ResourcePath, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far
        # and this exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
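    # Allow put() of datasets that are already defined in the registry; the
    # expected outputs are pre-inserted into the execution registry and only
    # put during execution.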
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
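    # Have the datastore trust get() requests: it will look for an artifact
    # at its expected location even when it holds no datastore record for
    # that dataset.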
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: str,
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the dataset types to be produced and insert the expected
    # datasets into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: str,
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
) -> Butler:
    r"""Export an input `QuantumGraph` into a new, minimal
    `~lsst.daf.butler.Butler` that contains only the datasets specified by
    the `QuantumGraph`. These are both the datasets that already exist in
    the input `~lsst.daf.butler.Butler` and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` that was used to create any `QuantumGraph`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        desired modifications to the `~lsst.daf.butler.Butler`. It
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting the datasets expected
        to be produced. Examples of what this callable could do include
        creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a
        directory.
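
    Examples
    --------
    A minimal usage sketch; the repository path, graph file, output location,
    collection names, and run name below are placeholders::

        from lsst.daf.butler import Butler
        from lsst.pipe.base import QuantumGraph
        from lsst.pipe.base.executionButlerBuilder import buildExecutionButler

        # Butler used to build the graph, and the graph itself.
        butler = Butler("/path/to/repo", collections=["my/input/collection"])
        graph = QuantumGraph.loadUri("/path/to/pipeline.qgraph")

        # Export the subset needed for execution into a standalone butler.
        executionButler = buildExecutionButler(
            butler,
            graph,
            outputLocation="/path/to/execution_butler",
            run="u/someone/example_run",
            clobber=True,
        )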
    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)

    # Do this first to fail fast if the output already exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because the caller might want to register
    # dataset types later. It would be nice to also check that, but to do so
    # we would need to be told whether the caller plans to register dataset
    # types later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer="auto",  # No transfers should be happening.
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
    )

    return newButler