
Commit 9f5275d

Added two new caches: FileCache, MemoryCache plus more tests
* added two new caches: FileCache, MemoryCache
* raise if similarity gets unexpected response
* updated interactive mode and added test cases
* updated web to work with Evaluator.Match
* separated evaluator classes into sub files
1 parent 703ca1e commit 9f5275d

28 files changed: +733 -221 lines

README.md

Lines changed: 26 additions & 3 deletions
@@ -1,6 +1,6 @@
 # 🏋️‍♂️ BenchLLM 🏋️‍♀️
 
-🦾 Continuous Integration for LLM powered applications 🦙🦅🤖
+🦾 Continuous Integration for LLM powered applications 🦙🦅🤖
 
 [![GitHub Repo stars](https://img.shields.io/github/stars/v7labs/BenchLLM?style=social)](https://github.com/v7labs/BenchLLM/stargazers)
 [![Twitter Follow](https://img.shields.io/twitter/follow/V7Labs?style=social)](https://twitter.com/V7Labs)
@@ -10,7 +10,6 @@
 
 BenchLLM is actively used at [V7](https://www.v7labs.com) for improving our LLM applications and is now Open Sourced under MIT License to share with the wider community
 
-
 ## 💡 Get help on [Discord](https://discord.gg/x7ExfHb3bG) or [Tweet at us](https://twitter.com/V7Labs)
 
 <hr/>
@@ -26,7 +25,7 @@ Use BenchLLM to:
 
 > ⚠️ **NOTE:** BenchLLM is in the early stage of development and will be subject to rapid changes.
 >
->For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.
+> For bug reporting, feature requests, or contributions, please open an issue or submit a pull request (PR) on our GitHub page.
 
 ## 🧪 BenchLLM Testing Methodology
 
@@ -116,6 +115,16 @@ The non interactive evaluators also supports `--workers N` to run in the evaluat
 $ bench run --evaluator string-match --workers 5
 ```
 
+To accelerate the evaluation process, BenchLLM uses a cache. If a (prediction, expected) pair has been evaluated in the past and a cache was used, the evaluation output will be saved for future evaluations. There are several types of caches:
+
+- `memory`, only caches output values during the current run. This is particularly useful when running with `--retry-count N`
+- `file`, stores the cache at the end of the run as a JSON file in output/cache.json. This is the default behavior.
+- `none`, does not use any cache.
+
+```bash
+$ bench run examples --cache memory
+```
+
 ### 🧮 Eval
 
 While _bench run_ runs each test function and then evaluates their output, it can often be beneficial to separate these into two steps. For example, if you want a person to manually do the evaluation or if you want to try multiple evaluation methods on the same function.
@@ -163,6 +172,20 @@ results = evaluator.run()
 print(results)
 ```
 
+If you want to incorporate caching and run multiple parallel evaluation jobs, you can modify your evaluator as follows:
+
+```python
+from benchllm.cache import FileCache
+
+...
+
+evaluator = FileCache(StringMatchEvaluator(workers=2), Path("path/to/cache.json"))
+evaluator.load(predictions)
+results = evaluator.run()
+```
+
+In this example, `FileCache` is used to enable caching, and the `workers` parameter of `StringMatchEvaluator` is set to `2` to allow for parallel evaluations. The cache results are saved in a file specified by `Path("path/to/cache.json")`.
+
 ## ☕️ Commands
 
 - `bench add`: Add a new test to a suite.
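Note: the README hunk above only shows the on-disk `FileCache`. A minimal in-memory counterpart, following the same setup as the README snippet (a sketch only, not part of this commit; `predictions` is assumed to come from the same place as in that example):

```python
from benchllm import StringMatchEvaluator
from benchllm.cache import MemoryCache

# In-memory counterpart of the FileCache example above: evaluations are reused
# within this run (handy with --retry-count) but nothing is written to disk.
# `predictions` is assumed to come from the same place as in the README example.
evaluator = MemoryCache(StringMatchEvaluator(workers=2))
evaluator.load(predictions)
results = evaluator.run()
print(evaluator.num_cache_hits, evaluator.num_cache_misses)
```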

benchllm/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
 import inspect
 from pathlib import Path
-from typing import Any, Callable, Generic, Optional, Type, TypeVar
+from typing import Callable, Type, TypeVar
 
 from .data_types import Evaluation, Prediction, Test # noqa
 from .evaluator import Evaluator, SemanticEvaluator, StringMatchEvaluator # noqa
-from .input_types import ChatInput, SimilarityInput
+from .input_types import ChatInput, SimilarityInput # noqa
 from .similarity import semantically_similar # noqa
 from .singleton import TestSingleton # noqa
 from .tester import Tester # noqa

benchllm/cache.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+import json
+from pathlib import Path
+from typing import Optional
+
+from benchllm.data_types import Evaluation, Prediction
+from benchllm.evaluator import Evaluator
+from benchllm.input_types import Json
+from benchllm.listener import EvaluatorListener
+
+
+class MemoryCache(Evaluator):
+    """Caches the results of the evaluator in memory"""
+
+    def __init__(self, evaluator: Evaluator):
+        super().__init__(workers=evaluator.workers)
+        self._data: dict = {}
+        self._evaluator = evaluator
+        self._num_cache_misses = 0
+        self._num_cache_hits = 0
+
+    def _key(self, answer1: Json, answer2: Json) -> str:
+        key1, key2 = json.dumps([answer1, answer2]), json.dumps([answer2, answer1])
+        return key1 if key1 < key2 else key2
+
+    def lookup(self, answer1: Json, answer2: Json) -> Optional[bool]:
+        return self._data.get(self._key(answer1, answer2), None)
+
+    def store(self, answer1: Json, answer2: Json, value: bool) -> None:
+        key = self._key(answer1, answer2)
+        self._data[key] = value
+
+    def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+        uncached_expectations = []
+        for expected in prediction.test.expected:
+            lookup = self.lookup(expected, prediction.output)
+            if lookup is None:
+                uncached_expectations.append(expected)
+            elif lookup:
+                # If we find a positive match we can stop comparing and just return.
+                # For negative matches we still need to check the other expected answers.
+                self._num_cache_hits += 1
+                return Evaluator.Match(prediction=prediction.output, expected=expected)
+
+        # If all expectations were found in the cache but were negative matches,
+        # we increment the cache hits counter and return None as there's no match.
+        if not uncached_expectations:
+            self._num_cache_hits += 1
+            return None
+
+        self._num_cache_misses += 1
+        # set prediction.test.expected to only the ones that were not cached
+        prediction = Prediction(**prediction.dict())
+        prediction.test.expected = uncached_expectations
+        result = self._evaluator.evaluate_prediction(prediction)
+        if result:
+            self.store(result.expected, result.prediction, True)
+        else:
+            for expected in prediction.test.expected:
+                self.store(expected, prediction.output, False)
+        return result
+
+    @property
+    def num_cache_hits(self) -> int:
+        return self._num_cache_hits
+
+    @property
+    def num_cache_misses(self) -> int:
+        return self._num_cache_misses
+
+
+class FileCache(MemoryCache, EvaluatorListener):
+    """Caches the results of the evaluator in a json file"""
+
+    def __init__(self, evaluator: Evaluator, path: Path):
+        super().__init__(evaluator)
+        self._path = path
+        self.add_listener(self)
+        self._load()
+
+    def _load(self) -> None:
+        if self._path.exists():
+            try:
+                cache = json.loads(self._path.read_text(encoding="UTF-8"), parse_int=str)
+                if cache["version"] != "1":
+                    raise ValueError("Unsupported cache version")
+                self._data = cache["entries"]
+            except Exception:
+                print(f"Failed to load cache file {self._path}")
+                self._data = {}
+
+    def _save(self) -> None:
+        cache = {"entries": self._data, "version": "1"}
+        self._path.write_text(json.dumps(cache, indent=4), encoding="UTF-8")
+
+    def evaluate_ended(self, evaluations: list[Evaluation]) -> None:
+        self._save()
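Note: the cache key built by `MemoryCache._key` is order-insensitive, which is easy to miss when skimming the diff. A small sketch, assuming `benchllm` is installed, that exercises `store`/`lookup` directly:

```python
from benchllm import StringMatchEvaluator
from benchllm.cache import MemoryCache

# Wrap any evaluator; the wrapped evaluator is only consulted on a cache miss.
cache = MemoryCache(StringMatchEvaluator(workers=1))

# _key() serialises the pair in both orders and keeps the smaller string,
# so (a, b) and (b, a) share a single cache entry.
cache.store("Paris", "The capital of France is Paris", True)
print(cache.lookup("The capital of France is Paris", "Paris"))  # True
print(cache.lookup("Paris", "Berlin"))  # None: not cached yet
```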

benchllm/cli/commands/evaluate.py

Lines changed: 8 additions & 4 deletions
@@ -1,13 +1,13 @@
 from pathlib import Path
 
+from benchllm.cache import FileCache
 from benchllm.cli.listener import ReportListener, RichCliListener
-from benchllm.cli.utils import get_evaluator
-from benchllm.evaluator import load_prediction_files
-from benchllm.utils import find_json_yml_files
+from benchllm.cli.utils import add_cache, get_evaluator
+from benchllm.utils import find_json_yml_files, load_prediction_files
 
 
 def evaluate_predictions(
-    file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str
+    file_or_dir: list[Path], model: str, output_dir: Path, workers: int, evaluator_name: str, cache: str
 ) -> bool:
     files = find_json_yml_files(file_or_dir)
 
@@ -17,6 +17,10 @@ def evaluate_predictions(
     load_prediction_files(file_or_dir)
 
     evaluator = get_evaluator(evaluator_name, model, workers)
+    evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json")
+
+    cli_listener.set_evaulator(evaluator)
+
     evaluator.add_listener(cli_listener)
     evaluator.add_listener(report_listener)
     for file in files:
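Note: `add_cache` is imported from `benchllm.cli.utils`, but its implementation is not part of the visible diff. A plausible sketch of such a helper, inferred from the call sites above and the `--cache` values documented in the README; purely illustrative, not the actual implementation:

```python
from pathlib import Path

from benchllm.cache import FileCache, MemoryCache
from benchllm.evaluator import Evaluator


def add_cache(cache: str, evaluator: Evaluator, path: Path) -> Evaluator:
    """Hypothetical helper: wrap the evaluator according to the --cache option."""
    if cache == "file":
        return FileCache(evaluator, path)
    if cache == "memory":
        return MemoryCache(evaluator)
    return evaluator  # "none": leave the evaluator unwrapped
```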

benchllm/cli/commands/run_suite.py

Lines changed: 7 additions & 1 deletion
@@ -2,8 +2,9 @@
 
 import typer
 
+from benchllm.cache import FileCache
 from benchllm.cli.listener import ReportListener, RichCliListener
-from benchllm.cli.utils import get_evaluator
+from benchllm.cli.utils import add_cache, get_evaluator
 from benchllm.tester import Tester
 from benchllm.utils import find_files
 
@@ -17,6 +18,7 @@ def run_suite(
     workers: int,
     evaluator_name: str,
     retry_count: int,
+    cache: str,
 ) -> bool:
     files = find_files(file_search_paths)
     if not files:
@@ -45,6 +47,10 @@
         return True
 
     evaluator = get_evaluator(evaluator_name, model, workers)
+    evaluator = add_cache(cache, evaluator, output_dir.parent / "cache.json")
+
+    cli_listener.set_evaulator(evaluator)
+
     evaluator.add_listener(cli_listener)
     evaluator.add_listener(report_listener)
     evaluator.load(tester.predictions)

benchllm/cli/evaluator.py

Lines changed: 0 additions & 78 deletions
This file was deleted.

benchllm/cli/evaluator/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+from benchllm.cli.evaluator.interactive import InteractiveEvaluator # noqa
+from benchllm.cli.evaluator.web import WebEvaluator # noqa

benchllm/cli/evaluator/interactive.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+from typing import Optional
+
+import click
+import typer
+
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class InteractiveEvaluator(Evaluator):
+    def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+        header = (
+            f'{typer.style("Does ", bold=True)}'
+            f"{typer.style(prediction.output, fg=typer.colors.BRIGHT_BLUE, bold=True)}"
+            f'{typer.style(" match any of the following expected prompts?", bold=True)}'
+        )
+        typer.echo("")
+        typer.echo(header)
+
+        for i, expected in enumerate(prediction.test.expected, start=1):
+            typer.secho(f"{i}. ", fg=typer.colors.BRIGHT_BLUE, bold=True, nl=False)
+            typer.secho(expected, bold=True)
+
+        options = [str(idx) for idx, _ in enumerate(prediction.test.expected, start=1)] + ["n"]
+
+        prompt_string = f"[{typer.style('matching number', fg=typer.colors.GREEN, bold=True)} or {typer.style('n', fg=typer.colors.RED, bold=True)}]"
+        click_choice = click.Choice(options)
+        response = typer.prompt(prompt_string, default="n", type=click_choice, show_choices=False).lower()
+        if response == "n":
+            return None
+        return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[int(response) - 1])
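Note: since `InteractiveEvaluator` is just another `Evaluator`, it can be combined with the new caches so manual verdicts are remembered between runs. A sketch of that pairing, assuming a worker count of 1 and an example cache path; this wiring is not part of the commit:

```python
from pathlib import Path

from benchllm.cache import FileCache
from benchllm.cli.evaluator import InteractiveEvaluator

# Illustrative pairing: persist manual verdicts so the same
# (prediction, expected) pair is not asked about again on the next run.
evaluator = FileCache(InteractiveEvaluator(workers=1), Path("output/cache.json"))
# evaluator.load(predictions)   # predictions from a Tester run or prediction files
# results = evaluator.run()
```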

benchllm/cli/evaluator/web.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+import signal
+from typing import Optional
+
+import typer
+from pywebio import session
+from pywebio.input import radio
+from pywebio.output import put_markdown
+
+from benchllm.data_types import Prediction
+from benchllm.evaluator import Evaluator
+
+
+class WebEvaluator(Evaluator):
+    def __init__(self) -> None:
+        super().__init__(workers=1)
+
+        @session.defer_call
+        def on_close() -> None:
+            print("shutting down")
+            typer.secho(
+                f"The evaluation was interrupted. Run bench eval to start again", fg=typer.colors.RED, bold=True
+            )
+            # sys.exit doesn't work here, so we have to raise a signal to kill the process
+            signal.raise_signal(signal.SIGINT)
+
+        put_markdown("# BenchLLM Web Evaluator")
+
+    def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
+        test_name = prediction.test.file_path or prediction.test.id
+
+        put_markdown(f"## {test_name}")
+        put_markdown(f"*Question*: `{prediction.test.input}`")
+        put_markdown(f"*Prediction*: `{prediction.output}`")
+
+        table = [["Question:", f"{prediction.test.input}", ""], ["Prediction:", prediction.output], ""]
+        label = f"Question: {prediction.test.input}Prediction: {prediction.output}"
+
+        options: list[dict[str, Optional[int | str]]] = [
+            {"label": expected, "value": idx} for idx, expected in enumerate(prediction.test.expected)
+        ]
+        options.append({"label": "None", "value": None, "selected": True})
+        answer = radio("Pick the matching answer", options=options, required=True)
+
+        if answer and isinstance(answer, int):
+            return Evaluator.Match(prediction=prediction.output, expected=prediction.test.expected[answer])
+        else:
+            return None
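Note: both new CLI evaluators implement the same single-method contract: `evaluate_prediction` returns an `Evaluator.Match` or `None`. A minimal custom evaluator under that contract, as a sketch; the class name and strict equality check are illustrative, not part of this commit:

```python
from typing import Optional

from benchllm.data_types import Prediction
from benchllm.evaluator import Evaluator


class ExactMatchEvaluator(Evaluator):
    """Illustrative sketch: accept a prediction only if it equals an expected answer verbatim."""

    def evaluate_prediction(self, prediction: Prediction) -> Optional[Evaluator.Match]:
        for expected in prediction.test.expected:
            if expected == prediction.output:
                return Evaluator.Match(prediction=prediction.output, expected=expected)
        return None
```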
