Coverage for python/lsst/pipe/base/executionButlerBuilder.py : 15%

# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, ButlerURI, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.core.utils import getClassOf
from lsst.daf.butler.transfers import RepoExportContext

from . import QuantumGraph, QuantumNode

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]
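
# For example (illustrative only), a DataSetTypeMap might look like
# {DatasetType("deepCoadd", ...): {DataCoordinate({"tract": 0, ...}), ...}},
# i.e. each dataset type maps to the set of dataIds expected for it.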


def _accumulate(graph: QuantumGraph) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the dataIds that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    n: QuantumNode
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)

            for datasetType, refs in attr.items():
                # initInputs maps each DatasetType to a single DatasetRef,
                # while inputs and outputs map to lists, so normalize to a
                # list here.
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references: if a ref has an id it
                # already exists and should be exported; if not, it should
                # be inserted into the new registry.
                for ref in refs:
                    if ref.isComponent():
                        # We can't insert a component, and a component will
                        # be part of some other upstream dataset, so it
                        # should be safe to skip them here.
                        continue

                    if ref.id is not None:
                        exports.add(ref)
                    else:
                        inserts[datasetType].add(ref.dataId)
    return exports, inserts
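
# A minimal sketch of how _accumulate partitions a graph's references
# (illustrative; "qgraph" stands in for a previously constructed
# QuantumGraph):
#
#     exports, inserts = _accumulate(qgraph)
#     # Resolved refs (ref.id is not None) already exist in the input butler
#     # and are collected for export:
#     #     exports -> {DatasetRef, ...}
#     # Unresolved refs are predicted outputs; only their dataIds are kept,
#     # grouped by dataset type, for later insertion into the new registry:
#     #     inserts -> {DatasetType: {DataCoordinate, ...}, ...}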


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> Set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # method may be able to be removed in the future if collection export
    # logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
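
# Illustrative walk-through of the fixed-point loop above, assuming a chained
# collection named "HSC/defaults" that contains "HSC/raw/all" and "HSC/calib"
# (names are made up):
#
#     _discoverCollections(butler, ["HSC/defaults"])
#     # Pass 1 flattens the chain, growing the set to
#     # {"HSC/defaults", "HSC/raw/all", "HSC/calib"}.
#     # Pass 2 discovers nothing new, so the loop exits and the full set is
#     # returned.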


def _export(butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef]) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf butler objects; however, it reaches in deep and does not use the
    # public methods so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # Yaml is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = getClassOf(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)
    exporter.saveDatasets(exports)

    # Look for any supplied collections; if there are none, use the defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs, following all
    # chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
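
# For reference, a rough public-API equivalent of the above would write the
# export to a file on disk instead of an in-memory buffer (sketch only; the
# filename is arbitrary):
#
#     with butler.export(filename="export.yaml", format="yaml") as export:
#         export.saveDatasets(exports)
#         for c in _discoverCollections(butler, collections):
#             export.saveCollection(c)
#
# Going through the private backend classes lets this module keep everything
# in memory and skip that filesystem round trip.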


def _setupNewButler(butler: Butler, outputLocation: ButlerURI, dirExists: bool) -> Butler:
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far
        # and the location exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if config.get(("datastore", "root")) == BUTLER_ROOT_TAG:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    config = Butler.makeRepo(root=outputLocation, config=config, overwrite=True, forceConfigRoot=False)

    # Return a newly created butler.
    return Butler(config, writeable=True)
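
# For orientation, the handful of keys rewritten above end up looking roughly
# like this in the new repo's config (paths are illustrative):
#
#     root: "file:///path/to/outputLocation"
#     registry:
#       db: "sqlite:///<butlerRoot>/gen3.sqlite3"
#     datastore:
#       root: "file:///path/to/original/repo"  # only when it was <butlerRoot>
#       trust_get_request: true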


def _import(yamlBuffer: io.StringIO,
            newButler: Butler,
            inserts: DataSetTypeMap,
            run: str,
            butlerModifier: Optional[Callable[[Butler], Butler]]
            ) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True)

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the dataset types to be produced and insert the expected
    # dataIds into the registry.
    for dsType, dataIds in inserts.items():
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
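
# A sketch of a butlerModifier callable that could be passed through to
# _import (illustrative; the run collection name is made up):
#
#     def addScratchRun(newButler: Butler) -> Butler:
#         # Any collections created here exist before the predicted outputs
#         # are registered below.
#         newButler.registry.registerRun("u/example/scratch")
#         return newButler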


def buildExecutionButler(butler: Butler,
                         graph: QuantumGraph,
                         outputLocation: Union[str, ButlerURI],
                         run: str,
                         *,
                         clobber: bool = False,
                         butlerModifier: Optional[Callable[[Butler], Butler]] = None,
                         collections: Optional[Iterable[str]] = None
                         ) -> Butler:
    r"""Export an input `QuantumGraph` into a new, minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`.

    These datasets are both those that already exist in the input
    `~lsst.daf.butler.Butler`, and those that are expected to be produced
    during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this function.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : `str` or `~lsst.daf.butler.ButlerURI`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `~lsst.daf.butler.ButlerURI` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever exists at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. It will be
        called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler`, but prior to inserting datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied, the `~lsst.daf.butler.Butler`\ 's
        `~lsst.daf.butler.Registry` default collections will be used.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and clobber is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a
        directory.
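
    Examples
    --------
    A minimal usage sketch; the repository path, saved-graph URI, and run
    collection name below are illustrative only::

        butler = Butler("/path/to/input/repo")
        qgraph = QuantumGraph.loadUri("pipeline.qgraph",
                                      butler.registry.dimensions)
        executionButler = buildExecutionButler(
            butler, qgraph, "/path/to/execution/butler", run="u/example/run"
        )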
    """
    outputLocation = ButlerURI(outputLocation)

    # Do this first to fail fast if the output already exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    exports, inserts = _accumulate(graph)
    yamlBuffer = _export(butler, collections, exports)

    newButler = _setupNewButler(butler, outputLocation, dirExists)

    return _import(yamlBuffer, newButler, inserts, run, butlerModifier)