Coverage for python/lsst/pipe/base/executionButlerBuilder.py : 15%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
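"""Support for exporting the contents referenced by a `QuantumGraph` into a
new, minimal "execution butler" repository.
"""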
from __future__ import annotations

__all__ = ("buildExecutionButler", )

import io

from collections import defaultdict
import itertools
from typing import Callable, DefaultDict, Mapping, Optional, Set, Tuple, Iterable, List, Union

from lsst.daf.butler import (DatasetRef, DatasetType, Butler, DataCoordinate, ButlerURI, Config)
from lsst.daf.butler.core.utils import getClassOf
from lsst.daf.butler.transfers import RepoExportContext
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG

from .graph import QuantumGraph, QuantumNode
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


42def _accumulate(
43 graph: QuantumGraph,
44 dataset_types: PipelineDatasetTypes,
45) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
46 # accumulate the DatasetRefs that will be transferred to the execution
47 # registry
49 # exports holds all the existing data that will be migrated to the
50 # execution butler
51 exports: Set[DatasetRef] = set()
53 # inserts is the mapping of DatasetType to dataIds for what is to be
54 # inserted into the registry. These are the products that are expected
55 # to be produced during processing of the QuantumGraph
56 inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)
58 # Add inserts for initOutputs (including initIntermediates); these are
59 # defined fully by their DatasetType, because they have no dimensions, and
60 # they are by definition not resolved. initInputs are part of Quantum and
61 # that's the only place the graph stores the dataset IDs, so we process
62 # them there even though each Quantum for a task has the same ones.
63 for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
64 inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))
66 n: QuantumNode
67 for quantum in (n.quantum for n in graph):
68 for attrName in ("initInputs", "inputs", "outputs"):
69 attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
71 for type, refs in attr.items():
72 # This if block is because init inputs has a different
73 # signature for its items
74 if not isinstance(refs, list):
75 refs = [refs]
76 # iterate over all the references, if it has an id, it
77 # means it exists and should be exported, if not it should
78 # be inserted into the new registry
79 for ref in refs:
80 if ref.id is not None:
81 exports.add(ref)
82 else:
83 if ref.isComponent():
84 # We can't insert a component, and a component will
85 # be part of some other upstream dataset, so it
86 # should be safe to skip them here
87 continue
88 inserts[type].add(ref.dataId)
89 return exports, inserts
def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(butler.registry.queryCollections(collections, flattenChains=True,
                                                                     includeChains=True))
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef],
            inserts: DataSetTypeMap) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; however, it reaches in deep and does not use the
    # public methods, so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = getClassOf(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Look for any defined collections; if there are none, get the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing table; if the code got this far and this exists,
        # clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # Requires that we use the dimension configuration from the original
    # butler and not use the defaults.
    config = Butler.makeRepo(root=outputLocation, config=config,
                             dimensionConfig=butler.registry.dimensions.dimensionConfig,
                             overwrite=True, forceConfigRoot=False)

    # Return a newly created butler.
    return Butler(config, writeable=True)


def _import(yamlBuffer: io.StringIO,
            newButler: Butler,
            inserts: DataSetTypeMap,
            run: str,
            butlerModifier: Optional[Callable[[Butler], Butler]]
            ) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True)

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler


def buildExecutionButler(butler: Butler,
                         graph: QuantumGraph,
                         outputLocation: Union[str, ButlerURI],
                         run: str,
                         *,
                         clobber: bool = False,
                         butlerModifier: Optional[Callable[[Butler], Butler]] = None,
                         collections: Optional[Iterable[str]] = None
                         ) -> Butler:
    r"""buildExecutionButler is a function that is responsible for exporting
    input `QuantumGraphs` into a new minimal `~lsst.daf.butler.Butler` which
    only contains datasets specified by the `QuantumGraph`. These datasets are
    both those that already exist in the input `~lsst.daf.butler.Butler`, and
    those that are expected to be produced during the execution of the
    `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler` and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        desired modifications to the `~lsst.daf.butler.Butler`. It will be
        called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and clobber is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ButlerURI(outputLocation, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the Pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because they might want to register dataset
    # types later. It would be nice to also check that, but to do that we
    # would need to be told whether they plan to register dataset types
    # later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    return _import(yamlBuffer, newButler, inserts, run, butlerModifier)
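
For readers of this coverage report, the following is a minimal, hypothetical sketch of how `buildExecutionButler` might be invoked from user code. The repository path, QuantumGraph file, run name, and collection names are illustrative placeholders and are not taken from this module, and the exact graph-loading call may differ between versions.

# Hypothetical usage sketch; all paths and collection/run names below are
# placeholders.
from lsst.daf.butler import Butler
from lsst.pipe.base.graph import QuantumGraph
from lsst.pipe.base.executionButlerBuilder import buildExecutionButler

# Open the existing data repository that was used to build the graph.
butler = Butler("/path/to/repo", collections=["HSC/defaults"])

# Load a previously serialized QuantumGraph (placeholder path).
with open("/path/to/pipeline.qgraph", "rb") as fd:
    qgraph = QuantumGraph.load(fd, butler.registry.dimensions)

# Export existing inputs and pre-register expected outputs into a new,
# standalone execution butler at the given location.
executionButler = buildExecutionButler(
    butler,
    qgraph,
    outputLocation="/path/to/execution_butler",
    run="u/someone/example_run",
    clobber=True,
)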