Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 11%
137 statements
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph
from .pipeline import PipelineDatasetTypes

DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType]
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # New dataset type encountered. Store it by name and by dataset type
    # so it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
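

# Illustrative sketch (not part of the original module): a minimal example of
# the caching contract described in the Notes above, assuming ``first`` and
# ``second`` are two DatasetType variants that share a name and differ only by
# a compatible storage class (hypothetical inputs, not constructed here).
def _example_validate_dataset_type(first: DatasetType, second: DatasetType) -> DatasetType:
    cache: dict[Union[str, DatasetType], DatasetType] = {}
    assert _validate_dataset_type(first, cache) is first
    # A later, compatible variant resolves back to the first one encountered;
    # an incompatible variant would raise ConflictingDefinitionError instead.
    return _validate_dataset_type(second, cache)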


def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but
    # we must ensure that only a single dataset type definition is
    # accumulated in the loop below. This data structure caches every dataset
    # type encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions.
    # initInputs are part of Quantum and that's the only place the graph stores
    # the dataset IDs, so we process them there even though each Quantum for a
    # task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))

    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: Set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # This branch exists because initInputs has a different
                # signature for its items.
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.id is not None:
                        # We could check existence of individual components,
                        # but it should be less work to check their parent.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        check_refs.add(ref)
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map

    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                # Iterate over all the references; if one exists it should be
                # exported, and if not it should be inserted into the new
                # registry.
                for ref in refs:
                    # A component dataset ID is the same as its parent ID, so
                    # checking a component against existing_ids works fine.
                    if ref.id is not None and ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component will
                            # be part of some other upstream dataset, so it
                            # should be safe to skip it here.
                            continue
                        type = _validate_dataset_type(type, datasetTypes)
                        inserts[type].add(ref.dataId)
    return exports, inserts
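

# Illustrative sketch (not part of the original module): how the two return
# values of _accumulate are typically consumed, assuming ``butler`` is the
# source repository and ``graph`` is a QuantumGraph built against it.
def _example_accumulate(butler: Butler, graph: QuantumGraph) -> None:
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)
    exports, inserts = _accumulate(butler, graph, dataset_types)
    # exports holds resolved DatasetRefs already known to the datastore;
    # inserts maps each DatasetType to the data IDs it is expected to gain.
    n_predicted = sum(len(dataIds) for dataIds in inserts.values())
    print(f"{len(exports)} existing datasets, {n_predicted} predicted outputs")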


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because in some situations collections were
    # not being properly discovered and exported. This method may become
    # unnecessary in the future if the collection export logic changes.
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections
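

# Illustrative sketch (not part of the original module): expanding a chained
# input collection to its full set of children before export, assuming
# "HSC/defaults" is the name of a CHAINED collection known to ``butler``
# (hypothetical collection name).
def _example_discover_collections(butler: Butler) -> set[str]:
    # The fixed-point loop above keeps querying until no new collections
    # (including chain members) are discovered.
    return _discoverCollections(butler, ["HSC/defaults"])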


def _export(
    butler: Butler, collections: Optional[Iterable[str]], exports: Set[DatasetRef], inserts: DataSetTypeMap
) -> io.StringIO:
    # This exports the datasets that exist in the input butler using
    # daf_butler objects; it reaches in deep and does not use the
    # public methods so that it can export to a string buffer and skip
    # disk access.
    yamlBuffer = io.StringIO()
    # YAML is hard-coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use the given collections if any were defined; otherwise fall back to
    # the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer
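

# Illustrative sketch (not part of the original module): the export produced
# above is a plain in-memory YAML document, so it can be inspected before it
# is handed to the new butler; ``exports`` and ``inserts`` are assumed to come
# from _accumulate.
def _example_peek_export(butler: Butler, exports: Set[DatasetRef], inserts: DataSetTypeMap) -> str:
    yamlBuffer = _export(butler, None, exports, inserts)
    text = yamlBuffer.read()
    # Rewind so the buffer can still be passed on to _import afterwards.
    yamlBuffer.seek(0)
    return text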


def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        Whether the ``outputLocation`` already exists. If it does, the caller
        must already have determined that it may be clobbered.
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far
        # and it exists, clobber must be true.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
    config["datastore", "trust_get_request"] = True

    # We are required to use the dimension configuration from the original
    # butler and not the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)
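

# Illustrative sketch (not part of the original module): creating the
# execution butler skeleton next to the source repository, assuming
# "/scratch/exec_butler" is a writable location that does not yet exist
# (hypothetical path).
def _example_setup_new_butler(butler: Butler) -> Butler:
    outputLocation = ResourcePath("/scratch/exec_butler", forceDirectory=True)
    # The returned butler shares the source datastore root but has its own
    # empty SQLite registry at outputLocation.
    return _setupNewButler(butler, outputLocation, dirExists=False)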


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This method takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register datasets to be produced and insert them into the registry.
    for dsType, dataIds in inserts.items():
        # There may be inconsistencies with storage class definitions,
        # so those differences must be checked.
        try:
            newButler.registry.registerDatasetType(dsType)
        except ConflictingDefinitionError:
            # We do not at this point know whether the dataset type is
            # an intermediate (and so must be able to support conversion
            # from the registry storage class to an input) or solely an output
            # dataset type. Test both compatibilities.
            registryDsType = newButler.registry.getDatasetType(dsType.name)
            if registryDsType.is_compatible_with(dsType) and dsType.is_compatible_with(registryDsType):
                # Ensure that we use the registry type when inserting.
                dsType = registryDsType
            else:
                # Not compatible, so re-raise the original exception.
                raise

        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
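

# Illustrative sketch (not part of the original module): the full export /
# import hand-off between the helpers above, assuming ``exports`` and
# ``inserts`` came from _accumulate, ``newButler`` from _setupNewButler, and
# ``run`` names the output RUN collection.
def _example_import(
    butler: Butler, newButler: Butler, exports: Set[DatasetRef], inserts: DataSetTypeMap, run: str
) -> Butler:
    yamlBuffer = _export(butler, None, exports, inserts)
    # Existing datasets are imported from the in-memory YAML; predicted
    # outputs are registered and inserted into the given run.
    return _import(yamlBuffer, newButler, inserts, run, butlerModifier=None)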


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export an input `QuantumGraph` into a new minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in ``registry.defaults`` will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        whatever is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied, this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this method could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all DatasetTypes from the Python pipeline and check any that
    # already exist in the registry for consistency. This does not check that
    # all dataset types here exist, because the caller might want to register
    # dataset types later. It would be nice to also check that, but to do that
    # we would need to be told whether they plan to register dataset types
    # later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)

    exports, inserts = _accumulate(butler, graph, dataset_types)
    yamlBuffer = _export(butler, collections, exports, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler
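

# Illustrative sketch (not part of the original module): a typical call,
# assuming ``butler`` points at the main repository, ``qgraph`` is a
# previously built QuantumGraph, and the path and collection names are
# hypothetical placeholders.
def _example_build_execution_butler(butler: Butler, qgraph: QuantumGraph) -> Butler:
    return buildExecutionButler(
        butler,
        qgraph,
        outputLocation="/scratch/exec_butler",
        run="u/someone/run",
        clobber=True,
        collections=["HSC/defaults"],
        transfer="copy",
    )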