Coverage for python/lsst/daf/butler/tests/_testRepo.py: 12% (144 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
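
    Examples
    --------
    A minimal sketch; this is not a doctest because it creates files on
    disk, and ``"testdir"`` stands in for any scratch directory:

    .. code-block:: py

        >>> butler = makeTestRepo(
        ...     "testdir", {"instrument": ["notACam"], "detector": [1]})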
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].viewOf is None:
            butler.registry.insertDimensionData(dimension, *records)
    return butler


def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned
        by `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the repository ``repo``.
        The collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
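
    Examples
    --------
    A sketch of per-test usage; not a doctest, and ``"testdir"`` stands in
    for any scratch directory:

    .. code-block:: py

        >>> repo = makeTestRepo("testdir")
        >>> butler = makeTestCollection(repo, uniqueId="my_test")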
    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
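
    Examples
    --------
    An illustrative sketch mirroring how `makeTestRepo` uses this helper
    (not a doctest; assumes ``butler`` came from `makeTestRepo`):

    .. code-block:: py

        >>> records = _makeRecords(
        ...     {"instrument": ["notACam"], "detector": [1]},
        ...     butler.dimensions)
        >>> butler.registry.insertDimensionData(
        ...     "instrument", *records["instrument"])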
    """
    # Create values for all dimensions that are (recursive) required or
    # implied dependencies of the given ones.
    complete_data_id_values = {}
    for dimension in universe.extract(dataIds.keys()):
        if dimension.name in dataIds:
            complete_data_id_values[dimension.name] = list(dataIds[dimension.name])
        if dimension.name not in complete_data_id_values:
            complete_data_id_values[dimension.name] = [_makeRandomDataIdValue(dimension)]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value : `str` or `int`
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. ``dimension``'s primary key
        maps to ``value``, but all other mappings (e.g., detector name)
        are arbitrary.
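
    Examples
    --------
    An illustrative sketch; the exact keys returned depend on how the
    dimension is defined in the universe (not a doctest; assumes ``butler``
    came from `makeTestRepo`):

    .. code-block:: py

        >>> _fillAllKeys(butler.dimensions["detector"], 42)
        {'id': 42, 'full_name': '42'}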
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for
            # the skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and
    not for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

        >>> butler = makeTestRepo(
        ...     "testdir", {"instrument": ["notACam"], "detector": [1]})
        >>> expandUniqueId(butler, {"detector": 1})
        DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.extract(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g.
    # instrument) given something that depends on it (e.g. visit), hence
    # check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if
    no such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible
        value already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False


def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord`
    that is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
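
    Examples
    --------
    An illustrative sketch; the exact fields depend on the dimension
    universe (not a doctest; assumes ``butler`` came from `makeTestRepo`):

    .. code-block:: py

        >>> _makeDimensionRecordDict(
        ...     {"instrument": "notACam", "detector": 5},
        ...     butler.dimensions["detector"])
        {'id': 5, 'full_name': '5', 'instrument': 'notACam'}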
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with
    a previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to
    existing values arbitrarily, and (for implied dependencies only) created
    and also inserted into the registry if they do not exist. Values for
    required dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
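
    A brief sketch (not a doctest; assumes ``butler`` came from
    `makeTestRepo`):

    .. code-block:: py

        >>> addDataIdValue(butler, "instrument", "notACam")
        >>> addDataIdValue(butler, "detector", 1, instrument="notACam")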
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        fullDimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extraKeys = related.keys() - fullDimension.graph.dimensions.names
    if extraKeys:
        raise ValueError(
            f"Unexpected keywords {extraKeys} not found in {fullDimension.graph.dimensions.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.extract(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data
    # IDs. This iteration is guaranteed to be in topological order, so we
    # can count on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_obj in all_dimensions:
        dimension_value = data_id.get(dimension_obj.name)
        if dimension_value is None:
            data_id[dimension_obj.name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_obj.name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_obj != fullDimension and dimension_obj in all_dimensions.required:
            # We also don't want to automatically create new dimension
            # records for required dimensions (except for the main dimension
            # the caller asked for); those are also asserted by the caller
            # to already exist.
            continue
        if dimension_obj.viewOf is not None:
            # Don't need to bother generating full records for dimensions
            # whose records are just a view into some other's records anyway.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e


def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
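
    Examples
    --------
    A sketch (not a doctest; assumes ``butler`` came from `makeTestRepo`
    and that the ``StructuredDataDict`` storage class is configured, as it
    is by default):

    .. code-block:: py

        >>> datasetType = addDatasetType(
        ...     butler, "data_dict", {"instrument", "detector"},
        ...     "StructuredDataDict")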
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the
    mock functions.
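
    Examples
    --------
    A sketch of typical usage (not a doctest; assumes ``butler`` and
    ``ref`` already exist in a test):

    .. code-block:: py

        >>> DatastoreMock.apply(butler)
        >>> butler.get(ref)  # now returns (ref.id, None) instead of data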
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler."""
        butler.datastore.export = DatastoreMock._mock_export  # type: ignore
        butler.datastore.get = DatastoreMock._mock_get  # type: ignore
        butler.datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the dataset ID
        value and the parameters it was given.
        """
        return (ref.id, parameters)