# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from typing import Any, Iterable, Mapping, Optional, Set, Tuple, Union
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)


def makeTestRepo(
    root: str, dataIds: Optional[Mapping[str, Iterable]] = None, *, config: Optional[Config] = None, **kwargs
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
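
    Examples
    --------
    A minimal sketch (not a doctest, because it creates files on disk);
    ``temp_dir`` is assumed to be a writable test-managed directory:

    .. code-block:: py

        butler = makeTestRepo(temp_dir, {"instrument": ["notACam"], "detector": [1, 2]})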
93 """
94 defaults = Config()
95 defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
96 defaults["datastore", "checksum"] = False # In case of future changes
97 defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"
99 if config:
100 defaults.update(config)
102 if not dataIds:
103 dataIds = {}
105 # Disable config root by default so that our registry override will
106 # not be ignored.
107 # newConfig guards against location-related keywords like outfile
108 newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
109 butler = Butler(newConfig, writeable=True)
110 dimensionRecords = _makeRecords(dataIds, butler.registry.dimensions)
111 for dimension, records in dimensionRecords.items():
112 if butler.registry.dimensions[dimension].viewOf is None:
113 butler.registry.insertDimensionData(dimension, *records)
114 return butler


def makeTestCollection(repo: Butler, uniqueId: Optional[str] = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository at ``root``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
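
    Examples
    --------
    A sketch of per-test isolation, assuming ``self.repo`` was created once
    by `makeTestRepo` (e.g., in ``setUpClass``) and using the standard
    `unittest.TestCase.id` method as a convenient unique ID:

    .. code-block:: py

        def setUp(self):
            self.butler = makeTestCollection(self.repo, uniqueId=self.id())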
141 """
142 if not uniqueId:
143 # Create a "random" collection name
144 # Speed matters more than cryptographic guarantees
145 uniqueId = str(random.randrange(1_000_000_000))
146 collection = "test_" + uniqueId
147 return Butler(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
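
    Examples
    --------
    A hypothetical call (the exact record contents depend on the dimension
    universe):

    .. code-block:: py

        records = _makeRecords({"instrument": ["notACam"], "detector": [1, 2]}, universe)
        # records["detector"] holds two DimensionRecords, both linked
        # (arbitrarily) to the "notACam" instrument record.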
169 """
171 # Create values for all dimensions that are (recursive) required or implied
172 # dependencies of the given ones.
173 complete_data_id_values = {}
174 for dimension in universe.extract(dataIds.keys()):
175 if dimension.name in dataIds:
176 complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
177 if dimension.name not in complete_data_id_values:
178 complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]
180 # Start populating dicts that will become DimensionRecords by providing
181 # alternate keys like detector names
182 record_dicts_by_dimension_name = {}
183 for name, values in complete_data_id_values.items():
184 record_dicts_by_dimension_name[name] = []
185 dimension = universe[name]
186 for value in values:
187 record_dicts_by_dimension_name[name].append(_fillAllKeys(dimension, value))
189 # Pick cross-relationships arbitrarily
190 for name, record_dicts in record_dicts_by_dimension_name.items():
191 dimension = universe[name]
192 for record_dict in record_dicts:
193 for other in dimension.dimensions:
194 if other != dimension:
195 relation = record_dicts_by_dimension_name[other.name][0]
196 record_dict[other.name] = relation[other.primaryKey.name]
198 return {
199 dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
200 for dimension, record_dicts in record_dicts_by_dimension_name.items()
201 }


def _fillAllKeys(dimension: Dimension, value: Union[str, int]) -> Mapping[str, Union[str, int]]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
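
    Examples
    --------
    A hypothetical illustration; the exact keys (e.g., ``full_name``) depend
    on the dimension universe:

    .. code-block:: py

        _fillAllKeys(universe["detector"], 42)
        # e.g., {"id": 42, "full_name": "42", ...}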
221 """
222 expandedValue = {}
223 for key in dimension.uniqueKeys:
224 if key.nbytes:
225 # For `bytes` fields, we want something that casts at least `str`
226 # and `int` values to bytes and yields b'' when called with no
227 # arguments (as in the except block below). Unfortunately, the
228 # `bytes` type itself fails for both `str` and `int`, but this
229 # lambda does what we need. This particularly important for the
230 # skymap dimensions' bytes 'hash' field, which has a unique
231 # constraint; without this, all skymaps would get a hash of b''
232 # and end up conflicting.
233 castType = lambda *args: str(*args).encode() # noqa: E731
234 else:
235 castType = key.dtype().python_type
236 try:
237 castValue = castType(value)
238 except TypeError:
239 castValue = castType()
240 expandedValue[key.name] = castValue
241 for key in dimension.metadata:
242 if not key.nullable:
243 expandedValue[key.name] = key.dtype().python_type(value)
244 return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> Union[int, str]:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This function will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. It is only
    suitable for certain kinds of test repositories, and not for
    repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
                "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, Union[str, int]], dimension: Dimension
) -> tuple[Union[str, int], bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if
    no such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible
        value already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        return matches[0].dataId[dimension.name], False


def _makeDimensionRecordDict(data_id: dict[str, Union[str, int]], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord`
    that is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: Union[str, int], **related: Union[str, int]):
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with
    a previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to
    existing values arbitrarily, and (for implied dependencies only) created
    and also inserted into the registry if they do not exist. Values for
    required dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
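
    A minimal sketch (not a doctest, since it modifies the repository); the
    dimension values here are purely illustrative:

    .. code-block:: py

        addDataIdValue(butler, "instrument", "notACam")
        addDataIdValue(butler, "detector", 42, instrument="notACam")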
423 """
424 # Example is not doctest, because it's probably unsafe to create even an
425 # in-memory butler in that environment.
426 try:
427 fullDimension = butler.registry.dimensions[dimension]
428 except KeyError as e:
429 raise ValueError from e
430 # Bad keys ignored by registry code
431 extraKeys = related.keys() - fullDimension.graph.dimensions.names
432 if extraKeys:
433 raise ValueError(
434 f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
435 )
437 # Assemble a dictionary data ID holding the given primary dimension value
438 # and all of the related ones.
439 data_id: dict[str, Union[int, str]] = {dimension: value}
440 data_id.update(related)
442 # Compute the set of all dimensions that these recursively depend on.
443 all_dimensions = butler.registry.dimensions.extract(data_id.keys())
445 # Create dicts that will become DimensionRecords for all of these data IDs.
446 # This iteration is guaranteed to be in topological order, so we can count
447 # on new data ID values being invented before they are needed.
448 record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
449 for dimension_obj in all_dimensions:
450 dimension_value = data_id.get(dimension_obj.name)
451 if dimension_value is None:
452 data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
453 if not invented:
454 # No need to make a new record; one already exists.
455 continue
456 if dimension_obj.name in related:
457 # Caller passed in a value of this dimension explicitly, but it
458 # isn't the primary dimension they asked to have a record created
459 # for. That means they expect this record to already exist.
460 continue
461 if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
462 # We also don't want to automatically create new dimension records
463 # for required dimensions (except for the main dimension the caller
464 # asked for); those are also asserted by the caller to already
465 # exist.
466 continue
467 if dimension_obj.viewOf is not None:
468 # Don't need to bother generating full records for dimensions whose
469 # records are just a view into some other's records anyway.
470 continue
471 record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)
473 # Sync those dimension record dictionaries with the database.
474 for dimension_obj, record_dict in record_dicts_by_dimension.items():
475 record = dimension_obj.RecordClass(**record_dict)
476 try:
477 butler.registry.syncDimensionData(dimension_obj, record)
478 except sqlalchemy.exc.IntegrityError as e:
479 raise RuntimeError(
480 "Could not create data ID value. Automatic relationship generation "
481 "may have failed; try adding keywords to assign a specific instrument, "
482 "physical_filter, etc. based on the nested exception message."
483 ) from e


def addDatasetType(butler: Butler, name: str, dimensions: Set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
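
    Examples
    --------
    A minimal sketch; the dataset type name, dimensions, and storage class
    are illustrative only:

    .. code-block:: py

        datasetType = addDatasetType(
            butler, "calexp", {"instrument", "visit", "detector"}, "ExposureF"
        )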
514 """
515 try:
516 datasetType = DatasetType(name, dimensions, storageClass, universe=butler.registry.dimensions)
517 butler.registry.registerDatasetType(datasetType)
518 return datasetType
519 except KeyError as e:
520 raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the
    mock functions.
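
    Examples
    --------
    A minimal sketch of stubbing out a test butler's datastore, assuming
    ``butler`` came from `makeTestCollection`:

    .. code-block:: py

        DatastoreMock.apply(butler)
        # Subsequent datastore reads return (dataset ID, parameters) pairs
        # instead of real datasets.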
529 """
531 @staticmethod
532 def apply(butler):
533 """Apply datastore mocks to a butler."""
534 butler.datastore.export = DatastoreMock._mock_export
535 butler.datastore.get = DatastoreMock._mock_get
536 butler.datastore.ingest = MagicMock()
538 @staticmethod
539 def _mock_export(
540 refs: Iterable[DatasetRef], *, directory: Optional[str] = None, transfer: Optional[str] = None
541 ) -> Iterable[FileDataset]:
542 """A mock of `Datastore.export` that satisfies the requirement that
543 the refs passed in are included in the `FileDataset` objects
544 returned.
546 This can be used to construct a `Datastore` mock that can be used
547 in repository export via::
549 datastore = unittest.mock.Mock(spec=Datastore)
550 datastore.export = DatastoreMock._mock_export
552 """
553 for ref in refs:
554 yield FileDataset(
555 refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
556 )
558 @staticmethod
559 def _mock_get(
560 ref: DatasetRef,
561 parameters: Optional[Mapping[str, Any]] = None,
562 storageClass: Optional[Union[StorageClass, str]] = None,
563 ) -> Tuple[int, Optional[Mapping[str, Any]]]:
564 """A mock of `Datastore.get` that just returns the integer dataset ID
565 value and parameters it was given.
566 """
567 return (ref.id, parameters)