Coverage for python/lsst/pipe/base/executionButlerBuilder.py: 10%
145 statements
coverage.py v7.2.5, created at 2023-05-04 05:01 -0700
# This file is part of pipe_base.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import annotations

__all__ = ("buildExecutionButler",)

import io
import itertools
from collections import defaultdict
from typing import Callable, DefaultDict, Iterable, List, Mapping, Optional, Set, Tuple, Union

from lsst.daf.butler import Butler, Config, DataCoordinate, DatasetRef, DatasetType, Registry
from lsst.daf.butler.core.repoRelocation import BUTLER_ROOT_TAG
from lsst.daf.butler.registry import ConflictingDefinitionError, MissingDatasetTypeError
from lsst.daf.butler.transfers import RepoExportContext
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_class_of

from .graph import QuantumGraph
from .pipeline import PipelineDatasetTypes
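
# Type alias: maps each output dataset type to the set of data IDs that will
# be registered in the execution butler; this is the shape of the ``inserts``
# value built by ``_accumulate`` below.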
DataSetTypeMap = Mapping[DatasetType, Set[DataCoordinate]]


def _validate_dataset_type(
    candidate: DatasetType, previous: dict[Union[str, DatasetType], DatasetType], registry: Registry
) -> DatasetType:
    """Check the dataset types and return a consistent variant if there are
    different compatible options.

    Parameters
    ----------
    candidate : `lsst.daf.butler.DatasetType`
        The candidate dataset type.
    previous : `dict` [Union[`str`, `DatasetType`], `DatasetType`]
        Previous dataset types found, indexed by name and also by
        dataset type. The latter provides a quick way of returning a
        previously checked dataset type.
    registry : `lsst.daf.butler.Registry`
        Main registry whose dataset type registration should override the
        given one if it exists.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The dataset type to be used. This can be different from the
        given ``candidate`` if a previous dataset type was encountered
        with the same name and this one is compatible with it.

    Raises
    ------
    ConflictingDefinitionError
        Raised if a candidate dataset type has the same name as one
        previously encountered but is not compatible with it.

    Notes
    -----
    This function ensures that if a dataset type is given that has the
    same name as a previously encountered dataset type but differs solely
    in a way that is interchangeable (through a supported storage class)
    then we will always return the first dataset type encountered instead
    of the new variant. We assume that the butler will handle the
    type conversion itself later.
    """
    # First check whether we have previously vetted this dataset type.
    # Return the vetted form immediately if we have.
    checked = previous.get(candidate)
    if checked:
        return checked

    # Have not previously encountered this dataset type.
    name = candidate.name
    if prevDsType := previous.get(name):
        # Check compatibility. For now assume both directions have to
        # be acceptable.
        if prevDsType.is_compatible_with(candidate) and candidate.is_compatible_with(prevDsType):
            # Ensure that if this dataset type is used again we will return
            # the version that we were first given with this name. Store
            # it for next time and return the previous one.
            previous[candidate] = prevDsType
            return prevDsType
        else:
            raise ConflictingDefinitionError(
                f"Dataset type incompatibility in graph: {prevDsType} not compatible with {candidate}"
            )

    # We haven't seen this dataset type in this graph before, but it may
    # already be in the registry.
    try:
        registryDsType = registry.getDatasetType(name)
        previous[candidate] = registryDsType
        return registryDsType
    except MissingDatasetTypeError:
        pass
    # Dataset type is totally new. Store it by name and by dataset type so
    # it will be validated immediately next time it comes up.
    previous[name] = candidate
    previous[candidate] = candidate
    return candidate
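
# Illustrative sketch (``variant_a`` and ``variant_b`` are hypothetical
# ``DatasetType`` instances with the same name and compatible storage
# classes): the cache built above guarantees that later lookups all resolve
# to the first (or registry) definition, e.g.
#
#     cache: dict[Union[str, DatasetType], DatasetType] = {}
#     resolved_a = _validate_dataset_type(variant_a, cache, butler.registry)
#     resolved_b = _validate_dataset_type(variant_b, cache, butler.registry)
#     # resolved_a == resolved_b: both calls return the same definition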


def _accumulate(
    butler: Butler,
    graph: QuantumGraph,
    dataset_types: PipelineDatasetTypes,
) -> Tuple[Set[DatasetRef], DataSetTypeMap]:
    # Accumulate the DatasetRefs that will be transferred to the execution
    # registry.

    # exports holds all the existing data that will be migrated to the
    # execution butler.
    exports: Set[DatasetRef] = set()

    # inserts is the mapping of DatasetType to dataIds for what is to be
    # inserted into the registry. These are the products that are expected
    # to be produced during processing of the QuantumGraph.
    inserts: DefaultDict[DatasetType, Set[DataCoordinate]] = defaultdict(set)

    # It is possible to end up with a graph that has different storage
    # classes attached to the same dataset type name. This is okay, but we
    # must ensure that only a single dataset type definition is accumulated
    # in the loop below. This data structure caches every dataset type
    # encountered and stores the compatible alternative.
    datasetTypes: dict[Union[str, DatasetType], DatasetType] = {}

    # Add inserts for initOutputs (including initIntermediates); these are
    # defined fully by their DatasetType, because they have no dimensions.
    # initInputs are part of Quantum and that's the only place the graph
    # stores the dataset IDs, so we process them there even though each
    # Quantum for a task has the same ones.
    for dataset_type in itertools.chain(dataset_types.initIntermediates, dataset_types.initOutputs):
        if dataset_type.component() is not None:
            dataset_type = dataset_type.makeCompositeDatasetType()
        dataset_type = _validate_dataset_type(dataset_type, datasetTypes, butler.registry)
        inserts[dataset_type].add(DataCoordinate.makeEmpty(dataset_type.dimensions.universe))
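
    # The next two passes work together: the first pass gathers every resolved
    # ref so datastore existence can be checked with a single bulk
    # ``knows_these`` call; the second pass then splits the refs into
    # ``exports`` (already present) and ``inserts`` (expected to be produced).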
    # Output references may be resolved even if they do not exist. Find all
    # actually existing refs.
    check_refs: Set[DatasetRef] = set()
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr: Mapping[DatasetType, Union[DatasetRef, List[DatasetRef]]] = getattr(quantum, attrName)
            for type, refs in attr.items():
                # This check is needed because initInputs maps each dataset
                # type to a single ref rather than a list.
                if not isinstance(refs, list):
                    refs = [refs]
                for ref in refs:
                    if ref.id is not None:
                        # We could check existence of individual components,
                        # but it should be less work to check their parent.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        check_refs.add(ref)
    exist_map = butler.datastore.knows_these(check_refs)
    existing_ids = set(ref.id for ref, exists in exist_map.items() if exists)
    del exist_map
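
    # Second pass: classify each ref either as an export (a dataset that
    # already exists in the source datastore) or as an insert (a dataset the
    # QuantumGraph is expected to produce).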
    for quantum in (n.quantum for n in graph):
        for attrName in ("initInputs", "inputs", "outputs"):
            attr = getattr(quantum, attrName)

            for type, refs in attr.items():
                if not isinstance(refs, list):
                    refs = [refs]
                if type.component() is not None:
                    type = type.makeCompositeDatasetType()
                type = _validate_dataset_type(type, datasetTypes, butler.registry)
                # Iterate over all the references: a ref that already exists
                # should be exported; otherwise it should be inserted into the
                # new registry.
                for ref in refs:
                    # Component dataset ID is the same as its parent ID, so
                    # checking component in existing_ids works OK.
                    if ref.id is not None and ref.id in existing_ids:
                        # If this is a component we want the composite to be
                        # exported.
                        if ref.isComponent():
                            ref = ref.makeCompositeRef()
                        # Make sure we export this with the registry's dataset
                        # type, since transfer_from doesn't handle storage
                        # class differences (maybe it should, but it's not
                        # bad to be defensive here even if that changes).
                        if type != ref.datasetType:
                            ref = ref.overrideStorageClass(type.storageClass)
                            assert ref.datasetType == type, "Dataset types should not differ in other ways."
                        exports.add(ref)
                    else:
                        if ref.isComponent():
                            # We can't insert a component, and a component
                            # will be part of some other upstream dataset, so
                            # it should be safe to skip them here.
                            continue
                        inserts[type].add(ref.dataId)
    return exports, inserts
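
# The two return values are consumed by ``buildExecutionButler`` below:
# ``exports`` is handed to ``Butler.transfer_from``, while ``inserts`` drives
# both ``_export`` (dimension records and collections) and ``_import``
# (dataset type registration and registry inserts).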


def _discoverCollections(butler: Butler, collections: Iterable[str]) -> set[str]:
    # Recurse through any discovered collections to make sure all collections
    # are exported. This exists because I ran into a situation where some
    # collections were not properly being discovered and exported. This
    # function may be removable in the future if the collection export logic
    # changes.
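    # This is a simple fixed-point iteration: keep re-querying, flattening
    # chained collections and including the chains themselves, until the set
    # of discovered collections stops growing.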
    collections = set(collections)
    while True:
        discoveredCollections = set(
            butler.registry.queryCollections(collections, flattenChains=True, includeChains=True)
        )
        if len(discoveredCollections) > len(collections):
            collections = discoveredCollections
        else:
            break
    return collections


def _export(butler: Butler, collections: Optional[Iterable[str]], inserts: DataSetTypeMap) -> io.StringIO:
    # This exports relevant dimension records and collections using daf_butler
    # objects. It reaches in deep and does not use the public methods, so that
    # it can export to a string buffer and skip disk access. It does not
    # export the datasets themselves, since we use transfer_from for that.
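    # The export is written to an in-memory string buffer rather than a file
    # on disk; ``_import`` below feeds this same buffer directly to
    # ``Butler.import_``.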
    yamlBuffer = io.StringIO()
    # YAML is hard coded, since the class controls both ends of the
    # export/import.
    BackendClass = get_class_of(butler._config["repo_transfer_formats", "yaml", "export"])
    backend = BackendClass(yamlBuffer, universe=butler.registry.dimensions)
    exporter = RepoExportContext(butler.registry, butler.datastore, backend, directory=None, transfer=None)

    # Need to ensure that the dimension records for outputs are
    # transferred.
    for _, dataIds in inserts.items():
        exporter.saveDataIds(dataIds)

    # Use the given collections if any were defined; otherwise fall back to
    # the registry defaults.
    if collections is None:
        collections = butler.registry.defaults.collections

    # Look up all collections associated with those inputs; this follows
    # all chains to make sure everything is properly exported.
    for c in _discoverCollections(butler, collections):
        exporter.saveCollection(c)
    exporter._finish()

    # Reset the string buffer to the beginning so the read operation will
    # actually *see* the data that was exported.
    yamlBuffer.seek(0)
    return yamlBuffer


def _setupNewButler(
    butler: Butler,
    outputLocation: ResourcePath,
    dirExists: bool,
    datastoreRoot: Optional[ResourcePath] = None,
) -> Butler:
    """Set up the execution butler.

    Parameters
    ----------
    butler : `Butler`
        The original butler, upon which the execution butler is based.
    outputLocation : `ResourcePath`
        Location of the execution butler.
    dirExists : `bool`
        `True` if the ``outputLocation`` already exists; in that case the
        caller has already determined that it should be clobbered.
    datastoreRoot : `ResourcePath`, optional
        Path for the execution butler datastore. If not specified, then the
        original butler's datastore will be used.

    Returns
    -------
    execution_butler : `Butler`
        Execution butler.
    """
    # Set up the new butler object at the specified location.
    if dirExists:
        # Remove the existing registry database; if the code got this far and
        # the location exists, clobber must be `True`.
        executionRegistry = outputLocation.join("gen3.sqlite3")
        if executionRegistry.exists():
            executionRegistry.remove()
    else:
        outputLocation.mkdir()

    # Copy the existing butler config, modifying the location of the
    # registry to the specified location.
    # Preserve the root path from the existing butler so things like
    # file data stores continue to look at the old location.
    config = Config(butler._config)
    config["root"] = outputLocation.geturl()
    config["allow_put_of_predefined_dataset"] = True
    config["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    # Remove any namespace that may be set in the main registry.
    config.pop(("registry", "namespace"), None)

    # The obscore manager cannot be used with the execution butler.
    config.pop(("registry", "managers", "obscore"), None)

    # Record the current root of the datastore if it is specified relative
    # to the butler root.
    if datastoreRoot is not None:
        config["datastore", "root"] = datastoreRoot.geturl()
    elif config.get(("datastore", "root")) == BUTLER_ROOT_TAG and butler._config.configDir is not None:
        config["datastore", "root"] = butler._config.configDir.geturl()
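    # Note: as I understand the daf_butler datastore configuration (treat this
    # description as an assumption, not something stated in this module),
    # ``trust_get_request`` puts the datastore in "trusted" mode, letting it
    # serve requests for artifacts that have no datastore records by assuming
    # the files exist at their expected locations.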
    config["datastore", "trust_get_request"] = True

    # We must use the dimension configuration from the original butler rather
    # than the defaults.
    config = Butler.makeRepo(
        root=outputLocation,
        config=config,
        dimensionConfig=butler.registry.dimensions.dimensionConfig,
        overwrite=True,
        forceConfigRoot=False,
    )

    # Return a newly created butler.
    return Butler(config, writeable=True)


def _import(
    yamlBuffer: io.StringIO,
    newButler: Butler,
    inserts: DataSetTypeMap,
    run: Optional[str],
    butlerModifier: Optional[Callable[[Butler], Butler]],
) -> Butler:
    # This function takes the exports from the existing butler, imports
    # them into the newly created butler, and then inserts the datasets
    # that are expected to be produced.

    # Import the existing datasets using "split" mode. "split" is safe
    # because the execution butler is assumed to be able to see all the file
    # locations that the main datastore can see. "split" supports some
    # absolute URIs in the datastore.
    newButler.import_(filename=yamlBuffer, format="yaml", reuseIds=True, transfer="split")

    # If there is a modifier callable, run it to make necessary updates
    # to the new butler.
    if butlerModifier is not None:
        newButler = butlerModifier(newButler)

    # Register the dataset types to be produced and insert the expected
    # datasets into the registry.
    for dsType, dataIds in inserts.items():
        # Storage class differences should have already been resolved by calls
        # to _validate_dataset_type in _accumulate, resulting in the registry
        # dataset type whenever that exists.
        newButler.registry.registerDatasetType(dsType)
        newButler.registry.insertDatasets(dsType, dataIds, run)

    return newButler
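
# Illustrative sketch of a ``butlerModifier`` callable (the helper name and
# run collection below are hypothetical placeholders): it receives the freshly
# imported execution butler and must return a writeable Butler, for example to
# pre-create an extra run collection before the expected outputs are inserted:
#
#     def _add_extra_run(b: Butler) -> Butler:
#         b.registry.registerRun("u/example/extra-run")
#         return b
#
#     # later: buildExecutionButler(..., butlerModifier=_add_extra_run)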


def buildExecutionButler(
    butler: Butler,
    graph: QuantumGraph,
    outputLocation: ResourcePathExpression,
    run: Optional[str],
    *,
    clobber: bool = False,
    butlerModifier: Optional[Callable[[Butler], Butler]] = None,
    collections: Optional[Iterable[str]] = None,
    datastoreRoot: Optional[ResourcePathExpression] = None,
    transfer: str = "auto",
) -> Butler:
    r"""Export an input `QuantumGraph` into a new, minimal
    `~lsst.daf.butler.Butler` which only contains datasets specified by the
    `QuantumGraph`. These datasets are both those that already exist in the
    input `~lsst.daf.butler.Butler`, and those that are expected to be
    produced during the execution of the `QuantumGraph`.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        This is the existing `~lsst.daf.butler.Butler` instance from which
        existing datasets will be exported. This should be the
        `~lsst.daf.butler.Butler` which was used to create any `QuantumGraphs`
        that will be converted with this object.
    graph : `QuantumGraph`
        Graph containing nodes that are to be exported into an execution
        butler.
    outputLocation : convertible to `ResourcePath`
        URI location at which the execution butler is to be exported. May be
        specified as a string or a `ResourcePath` instance.
    run : `str`, optional
        The run collection that the exported datasets are to be placed in. If
        `None`, the default value in registry.defaults will be used.
    clobber : `bool`, optional
        By default a butler will not be created if a file or directory
        already exists at the output location. If this is set to `True`,
        what is at the location will be deleted prior to running the
        export. Defaults to `False`.
    butlerModifier : `~typing.Callable`, optional
        If supplied this should be a callable that accepts a
        `~lsst.daf.butler.Butler`, and returns an instantiated
        `~lsst.daf.butler.Butler`. This callable may be used to make any
        modifications to the `~lsst.daf.butler.Butler` desired. This
        will be called after importing all datasets that exist in the input
        `~lsst.daf.butler.Butler` but prior to inserting Datasets expected
        to be produced. Examples of what this callable could do include
        things such as creating collections or runs.
    collections : `~typing.Iterable` of `str`, optional
        An iterable of collection names that will be exported from the input
        `~lsst.daf.butler.Butler` when creating the execution butler. If not
        supplied the `~lsst.daf.butler.Butler`\ 's `~lsst.daf.butler.Registry`
        default collections will be used.
    datastoreRoot : convertible to `ResourcePath`, optional
        Root directory for the datastore of the execution butler. If `None`,
        then the original butler's datastore will be used.
    transfer : `str`
        How (and whether) the input datasets should be added to the execution
        butler datastore. This should be a ``transfer`` string recognized by
        :func:`lsst.resources.ResourcePath.transfer_from`.
        ``"auto"`` means to ``"copy"`` if the ``datastoreRoot`` is specified.

    Returns
    -------
    executionButler : `lsst.daf.butler.Butler`
        An instance of the newly created execution butler.

    Raises
    ------
    FileExistsError
        Raised if something exists in the filesystem at the specified output
        location and ``clobber`` is `False`.
    NotADirectoryError
        Raised if the specified output URI does not correspond to a directory.
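
    Examples
    --------
    A minimal, illustrative call; the path and run name here are hypothetical
    placeholders rather than values taken from this package::

        exec_butler = buildExecutionButler(
            butler,
            qgraph,
            outputLocation="/scratch/exec_butler",
            run="u/example/run",
            clobber=True,
            transfer="copy",
        )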
    """
    # We know this must refer to a directory.
    outputLocation = ResourcePath(outputLocation, forceDirectory=True)
    if datastoreRoot is not None:
        datastoreRoot = ResourcePath(datastoreRoot, forceDirectory=True)

    # Do this first, to fail fast if the output exists.
    if (dirExists := outputLocation.exists()) and not clobber:
        raise FileExistsError("Cannot create a butler at specified location, location exists")
    if not outputLocation.isdir():
        raise NotADirectoryError("The specified output URI does not appear to correspond to a directory")

    # Gather all dataset types used by the pipeline and check any that already
    # exist in the registry for consistency. This does not check that all
    # dataset types here exist, because the caller might want to register
    # dataset types later. It would be nice to also check that, but to do that
    # we would need to be told whether they plan to register dataset types
    # later (DM-30845).
    dataset_types = PipelineDatasetTypes.fromPipeline(graph.iterTaskGraph(), registry=butler.registry)
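
    # Overview of the remaining steps: _accumulate splits the graph refs into
    # existing ``exports`` and expected ``inserts``; _export captures dimension
    # records and collections into an in-memory YAML buffer; _setupNewButler
    # creates the new repository; _import loads the YAML and registers the
    # expected outputs; finally transfer_from brings over the existing
    # datasets.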
    exports, inserts = _accumulate(butler, graph, dataset_types)
    yamlBuffer = _export(butler, collections, inserts)

    newButler = _setupNewButler(butler, outputLocation, dirExists, datastoreRoot)

    newButler = _import(yamlBuffer, newButler, inserts, run, butlerModifier)

    if transfer == "auto" and datastoreRoot is not None:
        transfer = "copy"

    # Transfer the existing datasets directly from the source butler.
    newButler.transfer_from(
        butler,
        exports,
        transfer=transfer,
        skip_missing=False,  # Everything should exist.
        register_dataset_types=True,
        transfer_dimensions=True,
    )

    return newButler