Coverage for python/lsst/daf/butler/tests/_testRepo.py: 12% (145 statements)
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "DatastoreMock",
    "addDataIdValue",
    "addDatasetType",
    "expandUniqueId",
    "makeTestCollection",
    "makeTestRepo",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy

from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` \
            [`str`, `~collections.abc.Iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        ``"detector"``). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions only
        when needed. This parameter is provided for compatibility with old
        code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided only
        for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit tests
    that don't depend on complex data relationships. It is ill-suited for tests
    where the structure of the data matters. If you need such a dataset, create
    it directly or use a saved test dataset.
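
    Examples
    --------
    A minimal sketch; the path ``"testdir"`` and dimension values are
    illustrative, and the example is not a doctest because it creates
    files on disk.

    .. code-block:: py

       >>> butler = makeTestRepo(
       ...     "testdir", {"instrument": ["notACam"], "detector": [1]})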
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler.from_config(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].has_own_table:
            butler.registry.insertDimensionData(dimension, *records)
    return butler


def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned by
        `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in the given repository. The
        collection is (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating an
    isolated test area, and not for repositories intended for real data
    processing or analysis.
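
    Examples
    --------
    A minimal sketch; ``repo`` would typically come from `makeTestRepo`, and
    the example is not a doctest because it modifies the repository.

    .. code-block:: py

       >>> butler = makeTestCollection(repo, uniqueId="my_test")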
    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler.from_config(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `~collections.abc.Iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        ``"detector"``).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `~collections.abc.Iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
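
    Examples
    --------
    A hypothetical sketch of the input and output shape; the comments
    describe the result schematically rather than showing real output.

    .. code-block:: py

       >>> records = _makeRecords({"detector": [1, 2]}, butler.dimensions)
       >>> # records maps "detector" to two DimensionRecords, and also holds
       >>> # invented records for related dimensions such as "instrument".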
    """
    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension_name in universe.conform(dataIds.keys()).names:
        if dimension_name in dataIds:
            complete_data_id_values[dimension_name] = list(dataIds[dimension_name])
        if dimension_name not in complete_data_id_values:
            complete_data_id_values[dimension_name] = [
                _makeRandomDataIdValue(universe.dimensions[dimension_name])
            ]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. ``dimension``'s primary key
        maps to ``value``, but all other mappings (e.g., detector name)
        are arbitrary.
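
    Examples
    --------
    A hypothetical illustration; the exact keys in the result depend on the
    dimension definitions in ``universe``.

    .. code-block:: py

       >>> _fillAllKeys(universe["detector"], 42)
       {'id': 42, 'full_name': '42'}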
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for the
            # skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This method will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. This
    function is only suitable for certain kinds of test repositories, and not
    for repositories intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

       >>> butler = makeTestRepo(
       ...     "testdir", {"instrument": ["notACam"], "detector": [1]})
       >>> expandUniqueId(butler, {"detector": 1})
       DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.conform(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False


def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related : `typing.Any`
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with a
    previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to existing
    values arbitrarily, and (for implied dependencies only) created and also
    inserted into the registry if they do not exist. Values for required
    dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage
    examples.
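
    A minimal sketch; the dimension values here are illustrative, and the
    example is not a doctest because it modifies the repository.

    .. code-block:: py

       >>> addDataIdValue(butler, "instrument", "notACam")
       >>> addDataIdValue(butler, "detector", 1, instrument="notACam")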
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        full_dimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extra_keys = related.keys() - full_dimension.minimal_group.names
    if extra_keys:
        raise ValueError(
            f"Unexpected keywords {extra_keys} not found in {full_dimension.minimal_group.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.conform(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data IDs.
    # This iteration is guaranteed to be in topological order, so we can count
    # on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_name in all_dimensions.names:
        dimension_obj = butler.dimensions.dimensions[dimension_name]
        dimension_value = data_id.get(dimension_name)
        if dimension_value is None:
            data_id[dimension_name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_name != dimension and dimension_name in all_dimensions.required:
            # We also don't want to automatically create new dimension records
            # for required dimensions (except for the main dimension the caller
            # asked for); those are also asserted by the caller to already
            # exist.
            continue
        if not dimension_obj.has_own_table:
            # Don't need to bother generating full records for dimensions whose
            # records are not actually stored.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e


def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
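
    Examples
    --------
    A minimal sketch; the dataset type name, dimensions, and storage class
    here are illustrative.

    .. code-block:: py

       >>> datasetType = addDatasetType(
       ...     butler, "calexp", {"instrument", "visit", "detector"}, "ExposureF")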
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
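
    Examples
    --------
    A minimal usage sketch; ``butler`` would typically come from
    `makeTestCollection`.

    .. code-block:: py

       >>> DatastoreMock.apply(butler)
       >>> # Datastore reads now return (dataset ID, parameters); see
       >>> # `_mock_get`.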
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler.

        Parameters
        ----------
        butler : `~lsst.daf.butler.Butler`
            Butler to be modified.
        """
        butler._datastore.export = DatastoreMock._mock_export  # type: ignore
        butler._datastore.get = DatastoreMock._mock_get  # type: ignore
        butler._datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the integer dataset ID
        value and parameters it was given.
        """
        return (ref.id, parameters)