Coverage for python/lsst/daf/butler/tests/_testRepo.py: 14% (145 statements)

# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = [
    "makeTestRepo",
    "makeTestCollection",
    "addDatasetType",
    "expandUniqueId",
    "DatastoreMock",
    "addDataIdValue",
]

import random
from collections.abc import Iterable, Mapping
from typing import TYPE_CHECKING, Any
from unittest.mock import MagicMock

import sqlalchemy
from lsst.daf.butler import (
    Butler,
    Config,
    DataCoordinate,
    DatasetRef,
    DatasetType,
    Dimension,
    DimensionUniverse,
    FileDataset,
    StorageClass,
)

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetId


def makeTestRepo(
    root: str, dataIds: Mapping[str, Iterable] | None = None, *, config: Config | None = None, **kwargs: Any
) -> Butler:
    """Create an empty test repository.

    Parameters
    ----------
    root : `str`
        The location of the root directory for the repository.
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`], optional
        A mapping keyed by the dimensions used in the test. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`). Related dimensions (e.g., instruments and detectors)
        are linked arbitrarily, with values created for implied dimensions
        only when needed. This parameter is provided for compatibility with
        old code; newer code should make the repository, then call
        `~lsst.daf.butler.tests.addDataIdValue`.
    config : `lsst.daf.butler.Config`, optional
        A configuration for the repository (for details, see
        `lsst.daf.butler.Butler.makeRepo`). If omitted, creates a repository
        with default dataset and storage types, but optimized for speed. The
        defaults set ``.datastore.cls``, ``.datastore.checksum`` and
        ``.registry.db``. If a supplied config does not specify these values,
        the internal defaults will be used to ensure that we have a usable
        configuration.
    **kwargs
        Extra arguments to `lsst.daf.butler.Butler.makeRepo`.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to the new repository. This Butler is provided
        only for additional setup; to keep test cases isolated, it is highly
        recommended that each test create its own Butler with a unique
        run/collection. See `makeTestCollection`.

    Notes
    -----
    This function provides a "quick and dirty" repository for simple unit
    tests that don't depend on complex data relationships. It is ill-suited
    for tests where the structure of the data matters. If you need such a
    dataset, create it directly or use a saved test dataset.
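
    Examples
    --------
    A minimal sketch of typical use in a test suite (the directory, dimension
    values, and collection ID here are illustrative; this is not a doctest
    because it writes to disk):

    .. code-block:: py

       repo = makeTestRepo("some/test/dir")
       addDataIdValue(repo, "instrument", "notACam")
       addDataIdValue(repo, "detector", 1)
       butler = makeTestCollection(repo, uniqueId="test1")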
    """
    defaults = Config()
    defaults["datastore", "cls"] = "lsst.daf.butler.datastores.inMemoryDatastore.InMemoryDatastore"
    defaults["datastore", "checksum"] = False  # In case of future changes
    defaults["registry", "db"] = "sqlite:///<butlerRoot>/gen3.sqlite3"

    if config:
        defaults.update(config)

    if not dataIds:
        dataIds = {}

    # Disable config root by default so that our registry override will
    # not be ignored.
    # newConfig guards against location-related keywords like outfile
    newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
    butler = Butler.from_config(newConfig, writeable=True)
    dimensionRecords = _makeRecords(dataIds, butler.dimensions)
    for dimension, records in dimensionRecords.items():
        if butler.dimensions[dimension].has_own_table:
            butler.registry.insertDimensionData(dimension, *records)
    return butler


def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
    """Create a read/write Butler to a fresh collection.

    Parameters
    ----------
    repo : `lsst.daf.butler.Butler`
        A previously existing Butler to a repository, such as that returned by
        `~lsst.daf.butler.Butler.makeRepo` or `makeTestRepo`.
    uniqueId : `str`, optional
        A collection ID guaranteed by external code to be unique across all
        calls to ``makeTestCollection`` for the same repository.

    Returns
    -------
    butler : `lsst.daf.butler.Butler`
        A Butler referring to a new collection in ``repo``. The collection is
        (almost) guaranteed to be new.

    Notes
    -----
    This function creates a single run collection that does not necessarily
    conform to any repository conventions. It is only suitable for creating
    an isolated test area, and not for repositories intended for real data
    processing or analysis.
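
    Examples
    --------
    A minimal sketch (assumes ``repo`` was created with `makeTestRepo`; the
    collection ID is illustrative, and a test's own name is one convenient
    source of unique IDs):

    .. code-block:: py

       butler = makeTestCollection(repo, uniqueId="MyTestCase_test_something")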
    """
    if not uniqueId:
        # Create a "random" collection name
        # Speed matters more than cryptographic guarantees
        uniqueId = str(random.randrange(1_000_000_000))
    collection = "test_" + uniqueId
    return Butler.from_config(butler=repo, run=collection)


def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
    """Create cross-linked dimension records from a collection of
    data ID values.

    Parameters
    ----------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest. Each value is an
        iterable of names for that dimension (e.g., detector IDs for
        `"detector"`).
    universe : `lsst.daf.butler.DimensionUniverse`
        Set of all known dimensions and their relationships.

    Returns
    -------
    dataIds : `~collections.abc.Mapping` [`str`, `iterable`]
        A mapping keyed by the dimensions of interest, giving one
        `~lsst.daf.butler.DimensionRecord` for each input name. Related
        dimensions (e.g., instruments and detectors) are linked arbitrarily.
    """
    # Create values for all dimensions that are (recursive) required or implied
    # dependencies of the given ones.
    complete_data_id_values = {}
    for dimension_name in universe.conform(dataIds.keys()).names:
        if dimension_name in dataIds:
            complete_data_id_values[dimension_name] = list(dataIds[dimension_name])
        if dimension_name not in complete_data_id_values:
            complete_data_id_values[dimension_name] = [
                _makeRandomDataIdValue(universe.dimensions[dimension_name])
            ]

    # Start populating dicts that will become DimensionRecords by providing
    # alternate keys like detector names
    record_dicts_by_dimension_name: dict[str, list[dict[str, str | int | bytes]]] = {}
    for name, values in complete_data_id_values.items():
        record_dicts_by_dimension_name[name] = []
        dimension_el = universe[name]
        for value in values:
            # _fillAllKeys wants Dimension and not DimensionElement.
            # universe.__getitem__ says it returns DimensionElement but this
            # really does also seem to be a Dimension here.
            record_dicts_by_dimension_name[name].append(
                _fillAllKeys(dimension_el, value)  # type: ignore[arg-type]
            )

    # Pick cross-relationships arbitrarily
    for name, record_dicts in record_dicts_by_dimension_name.items():
        dimension_el = universe[name]
        for record_dict in record_dicts:
            for other in dimension_el.dimensions:
                if other != dimension_el:
                    relation = record_dicts_by_dimension_name[other.name][0]
                    record_dict[other.name] = relation[other.primaryKey.name]

    return {
        dimension: [universe[dimension].RecordClass(**record_dict) for record_dict in record_dicts]
        for dimension, record_dicts in record_dicts_by_dimension_name.items()
    }


def _fillAllKeys(dimension: Dimension, value: str | int) -> dict[str, str | int | bytes]:
    """Create an arbitrary mapping of all required keys for a given dimension
    that do not refer to other dimensions.

    Parameters
    ----------
    dimension : `lsst.daf.butler.Dimension`
        The dimension for which to generate a set of keys (e.g., detector).
    value : `str` or `int`
        The value assigned to ``dimension`` (e.g., detector ID).

    Returns
    -------
    expandedValue : `dict` [`str`]
        A mapping of dimension keys to values. The primary key of
        ``dimension`` maps to ``value``, but all other mappings (e.g.,
        detector name) are arbitrary.
    """
    expandedValue: dict[str, str | int | bytes] = {}
    for key in dimension.uniqueKeys:
        if key.nbytes:
            # For `bytes` fields, we want something that casts at least `str`
            # and `int` values to bytes and yields b'' when called with no
            # arguments (as in the except block below). Unfortunately, the
            # `bytes` type itself fails for both `str` and `int`, but this
            # lambda does what we need. This is particularly important for the
            # skymap dimension's bytes 'hash' field, which has a unique
            # constraint; without this, all skymaps would get a hash of b''
            # and end up conflicting.
            castType = lambda *args: str(*args).encode()  # noqa: E731
        else:
            castType = key.dtype().python_type
        try:
            castValue = castType(value)
        except TypeError:
            castValue = castType()
        expandedValue[key.name] = castValue
    for key in dimension.metadata:
        if not key.nullable:
            expandedValue[key.name] = key.dtype().python_type(value)
    return expandedValue


def _makeRandomDataIdValue(dimension: Dimension) -> int | str:
    """Generate a random value of the appropriate type for a data ID key.

    Parameters
    ----------
    dimension : `Dimension`
        Dimension the value corresponds to.

    Returns
    -------
    value : `int` or `str`
        Random value.
    """
    if dimension.primaryKey.getPythonType() is str:
        return str(random.randrange(1000))
    else:
        return random.randrange(1000)


def expandUniqueId(butler: Butler, partialId: Mapping[str, Any]) -> DataCoordinate:
    """Return a complete data ID matching some criterion.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to query.
    partialId : `~collections.abc.Mapping` [`str`]
        A mapping of known dimensions and values.

    Returns
    -------
    dataId : `lsst.daf.butler.DataCoordinate`
        The unique data ID that matches ``partialId``.

    Raises
    ------
    ValueError
        Raised if ``partialId`` does not uniquely identify a data ID.

    Notes
    -----
    This function will only work correctly if all dimensions attached to the
    target dimension (e.g., "physical_filter" for "visit") are known to the
    repository, even if they're not needed to identify a dataset. It is only
    suitable for certain kinds of test repositories, and not for repositories
    intended for real data processing or analysis.

    Examples
    --------
    .. code-block:: py

       >>> butler = makeTestRepo(
               "testdir", {"instrument": ["notACam"], "detector": [1]})
       >>> expandUniqueId(butler, {"detector": 1})
       DataCoordinate({instrument, detector}, ('notACam', 1))
    """
    # The example is *not* a doctest because it requires dangerous I/O
    registry = butler.registry
    dimensions = registry.dimensions.conform(partialId.keys()).required

    query = " AND ".join(f"{dimension} = {value!r}" for dimension, value in partialId.items())

    # Much of the purpose of this function is to do something we explicitly
    # reject most of the time: query for a governor dimension (e.g. instrument)
    # given something that depends on it (e.g. visit), hence check=False.
    dataId = list(registry.queryDataIds(dimensions, where=query, check=False))
    if len(dataId) == 1:
        return dataId[0]
    else:
        raise ValueError(f"Found {len(dataId)} matches for {partialId}, expected 1.")


def _findOrInventDataIdValue(
    butler: Butler, data_id: dict[str, str | int], dimension: Dimension
) -> tuple[str | int, bool]:
    """Look up an arbitrary value for a dimension that is consistent with a
    partial data ID that does not specify that dimension, or invent one if no
    such value exists.

    Parameters
    ----------
    butler : `Butler`
        Butler to use to look up data ID values.
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary of possibly-related data ID values.
    dimension : `Dimension`
        Dimension to obtain a value for.

    Returns
    -------
    value : `int` or `str`
        Value for this dimension.
    invented : `bool`
        `True` if the value had to be invented, `False` if a compatible value
        already existed.
    """
    # No values given by caller for this dimension. See if any exist
    # in the registry that are consistent with the values of dimensions
    # we do have:
    match_data_id = {key: data_id[key] for key in data_id.keys() & dimension.dimensions.names}
    matches = list(butler.registry.queryDimensionRecords(dimension, dataId=match_data_id).limit(1))
    if not matches:
        # Nothing in the registry matches: invent a data ID value
        # with the right type (actual value does not matter).
        # We may or may not actually make a record with this; that's
        # easier to check later.
        dimension_value = _makeRandomDataIdValue(dimension)
        return dimension_value, True
    else:
        # A record does exist in the registry. Use its data ID value.
        dim_value = matches[0].dataId[dimension.name]
        assert dim_value is not None
        return dim_value, False


def _makeDimensionRecordDict(data_id: dict[str, str | int], dimension: Dimension) -> dict[str, Any]:
    """Create a dictionary that can be used to build a `DimensionRecord` that
    is consistent with the given data ID.

    Parameters
    ----------
    data_id : `dict` [ `str`, `str` or `int` ]
        Dictionary that contains values for at least all of
        ``dimension.dimensions.names`` (the main dimension, its recursive
        required dependencies, and its non-recursive implied dependencies).
    dimension : `Dimension`
        Dimension to build a record dictionary for.

    Returns
    -------
    record_dict : `dict` [ `str`, `object` ]
        Dictionary that can be passed as ``**kwargs`` to this dimension's
        record class constructor.
    """
    # Add the primary key field for this dimension.
    record_dict: dict[str, Any] = {dimension.primaryKey.name: data_id[dimension.name]}
    # Define secondary keys (e.g., detector name given detector id)
    record_dict.update(_fillAllKeys(dimension, data_id[dimension.name]))
    # Set the foreign key values for any related dimensions that should
    # appear in the record.
    for related_dimension in dimension.dimensions:
        if related_dimension.name != dimension.name:
            record_dict[related_dimension.name] = data_id[related_dimension.name]
    return record_dict


def addDataIdValue(butler: Butler, dimension: str, value: str | int, **related: str | int) -> None:
    """Add the records that back a new data ID to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    dimension : `str`
        The name of the dimension to gain a new value.
    value : `str` or `int`
        The value to register for the dimension.
    **related : `typing.Any`
        Any existing dimensions to be linked to ``value``.

    Notes
    -----
    Related dimensions (e.g., the instrument associated with a detector) may
    be specified using ``related``, which requires a value for those
    dimensions to have been added to the repository already (generally with a
    previous call to `addDataIdValue`). Any dependencies of the given
    dimension that are not included in ``related`` will be linked to existing
    values arbitrarily, and (for implied dependencies only) created and also
    inserted into the registry if they do not exist. Values for required
    dimensions and those given in ``related`` are never created.

    Because this function creates filler data, it is only suitable for test
    repositories. It should not be used for repositories intended for real
    data processing or analysis, which have known dimension values.

    Examples
    --------
    See the guide on :ref:`using-butler-in-tests-make-repo` for usage examples.
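
    As a minimal sketch (the dimension values here are illustrative, and this
    is not a doctest):

    .. code-block:: py

       addDataIdValue(butler, "instrument", "notACam")
       addDataIdValue(butler, "physical_filter", "k2022", instrument="notACam")
       addDataIdValue(butler, "detector", 101)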
    """
    # Example is not doctest, because it's probably unsafe to create even an
    # in-memory butler in that environment.
    try:
        full_dimension = butler.dimensions[dimension]
    except KeyError as e:
        raise ValueError from e
    # Bad keys ignored by registry code
    extra_keys = related.keys() - full_dimension.minimal_group.names
    if extra_keys:
        raise ValueError(
            f"Unexpected keywords {extra_keys} not found in {full_dimension.minimal_group.names}"
        )

    # Assemble a dictionary data ID holding the given primary dimension value
    # and all of the related ones.
    data_id: dict[str, int | str] = {dimension: value}
    data_id.update(related)

    # Compute the set of all dimensions that these recursively depend on.
    all_dimensions = butler.dimensions.conform(data_id.keys())

    # Create dicts that will become DimensionRecords for all of these data IDs.
    # This iteration is guaranteed to be in topological order, so we can count
    # on new data ID values being invented before they are needed.
    record_dicts_by_dimension: dict[Dimension, dict[str, Any]] = {}
    for dimension_name in all_dimensions.names:
        dimension_obj = butler.dimensions.dimensions[dimension_name]
        dimension_value = data_id.get(dimension_name)
        if dimension_value is None:
            data_id[dimension_name], invented = _findOrInventDataIdValue(butler, data_id, dimension_obj)
            if not invented:
                # No need to make a new record; one already exists.
                continue
        if dimension_name in related:
            # Caller passed in a value of this dimension explicitly, but it
            # isn't the primary dimension they asked to have a record created
            # for. That means they expect this record to already exist.
            continue
        if dimension_name != dimension and dimension_name in all_dimensions.required:
            # We also don't want to automatically create new dimension records
            # for required dimensions (except for the main dimension the caller
            # asked for); those are also asserted by the caller to already
            # exist.
            continue
        if not dimension_obj.has_own_table:
            # Don't need to bother generating full records for dimensions whose
            # records are not actually stored.
            continue
        record_dicts_by_dimension[dimension_obj] = _makeDimensionRecordDict(data_id, dimension_obj)

    # Sync those dimension record dictionaries with the database.
    for dimension_obj, record_dict in record_dicts_by_dimension.items():
        record = dimension_obj.RecordClass(**record_dict)
        try:
            butler.registry.syncDimensionData(dimension_obj, record)
        except sqlalchemy.exc.IntegrityError as e:
            raise RuntimeError(
                "Could not create data ID value. Automatic relationship generation "
                "may have failed; try adding keywords to assign a specific instrument, "
                "physical_filter, etc. based on the nested exception message."
            ) from e


def addDatasetType(butler: Butler, name: str, dimensions: set[str], storageClass: str) -> DatasetType:
    """Add a new dataset type to a repository.

    Parameters
    ----------
    butler : `lsst.daf.butler.Butler`
        The repository to update.
    name : `str`
        The name of the dataset type.
    dimensions : `set` [`str`]
        The dimensions of the new dataset type.
    storageClass : `str`
        The storage class the dataset will use.

    Returns
    -------
    datasetType : `lsst.daf.butler.DatasetType`
        The new type.

    Raises
    ------
    ValueError
        Raised if the dimensions or storage class is invalid.

    Notes
    -----
    Dataset types are shared across all collections in a repository, so this
    function does not need to be run for each collection.
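
    Examples
    --------
    A minimal sketch (the dataset type name, dimensions, and storage class
    below are illustrative):

    .. code-block:: py

       datasetType = addDatasetType(
           butler, "calexp", {"instrument", "visit", "detector"}, "ExposureF"
       )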
    """
    try:
        datasetType = DatasetType(name, dimensions, storageClass, universe=butler.dimensions)
        butler.registry.registerDatasetType(datasetType)
        return datasetType
    except KeyError as e:
        raise ValueError from e


class DatastoreMock:
    """Mocks a butler datastore.

    Has functions that mock the datastore in a butler. Provides an `apply`
    function to replace the relevant butler datastore functions with the mock
    functions.
    """

    @staticmethod
    def apply(butler: Butler) -> None:
        """Apply datastore mocks to a butler.

        Parameters
        ----------
        butler : `~lsst.daf.butler.Butler`
            Butler to be modified.
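
        Examples
        --------
        A minimal sketch; after applying the mock, reads through the butler
        return the dataset ID and parameters instead of real datasets, and
        ingest becomes a no-op `~unittest.mock.MagicMock`:

        .. code-block:: py

           DatastoreMock.apply(butler)
           # butler.get(...) now returns (dataset ID, parameters).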
        """
        butler._datastore.export = DatastoreMock._mock_export  # type: ignore
        butler._datastore.get = DatastoreMock._mock_get  # type: ignore
        butler._datastore.ingest = MagicMock()  # type: ignore

    @staticmethod
    def _mock_export(
        refs: Iterable[DatasetRef], *, directory: str | None = None, transfer: str | None = None
    ) -> Iterable[FileDataset]:
        """Mock of `Datastore.export` that satisfies the requirement that
        the refs passed in are included in the `FileDataset` objects
        returned.

        This can be used to construct a `Datastore` mock that can be used
        in repository export via::

            datastore = unittest.mock.Mock(spec=Datastore)
            datastore.export = DatastoreMock._mock_export

        """
        for ref in refs:
            yield FileDataset(
                refs=[ref], path="mock/path", formatter="lsst.daf.butler.formatters.json.JsonFormatter"
            )

    @staticmethod
    def _mock_get(
        ref: DatasetRef,
        parameters: Mapping[str, Any] | None = None,
        storageClass: StorageClass | str | None = None,
    ) -> tuple[DatasetId, Mapping[str, Any] | None]:
        """Mock of `Datastore.get` that just returns the dataset ID and the
        parameters it was given.
        """
        return (ref.id, parameters)