Skip to content

Commit 53be8c1

Browse files
committed
chore(research): Adds project for evaluating models with lm-eval.
1 parent 0728620 commit 53be8c1

10 files changed

Lines changed: 324 additions & 10 deletions

File tree

research/harness/.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
# Cache files
5+
cache/
6+
**/*.db

research/harness/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Harness
2+
3+
## Installation
4+
5+
To install Harness, run the following commands in your command line:
6+
7+
```shell
8+
conda create -n harness python=3.8
9+
conda activate harness
10+
11+
pip install -e .
12+
```
13+
14+
## Evaluating with LM-Evaluation-Harness (lm-eval)
15+
16+
To evaluate your model with LM-Evaluation-Harness, run the following command:
17+
18+
```shell
19+
python evaluate_with_lm_eval.py --help
20+
```
21+
22+
This will give you a list of options and arguments that can be passed to the script to evaluate your model. For example:
23+
24+
```shell
25+
python evaluate_with_lm_eval.py gpt2 gpt2 --tasks cb,copa
26+
```
27+
28+
This will evaluate a pre-trained GPT-2 from Hugging Face's Hub, using the `gpt2` pre-trained tokenizer on two SuperGLUE tasks: CommitmentBank and Choice of Plausible Alternatives.
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
import argparse
5+
import json
6+
7+
from harness.lm_eval_evaluator import evaluate_wrapper
8+
from harness.lm_eval_hf_model import HFEvalModel
9+
from harness.lm_eval_utils import MultiChoice, pattern_match
10+
from lm_eval.evaluator import make_table
11+
from lm_eval.tasks import ALL_TASKS
12+
13+
14+
def parse_args():
    """Builds and parses the command-line arguments accepted by this script.

    Returns:
        `argparse.Namespace` with the evaluation settings.

    """
    parser = argparse.ArgumentParser(description="Evaluates pre-trained models using lm-eval package.")

    # Positional arguments: where the model weights live and which tokenizer to load.
    parser.add_argument("pre_trained_model_path", type=str, help="Path to the pre-trained model file.")
    parser.add_argument("hub_tokenizer_path", type=str, help="Name or path to the Hugging Face hub's tokenizer.")

    # `MultiChoice` validates comma-separated, glob-style task patterns.
    parser.add_argument("-t", "--tasks", choices=MultiChoice(ALL_TASKS), type=str, default=None,
                        help="Tasks to be evaluated (separated by comma), e.g., `wsc,cb,copa`.")
    parser.add_argument("-o", "--output_path", type=str, default=None,
                        help="Path to the saved outputs.")
    parser.add_argument("-ns", "--n_few_shot_samples", type=int, default=0,
                        help="Number of few-shot samples.")
    parser.add_argument("-ls", "--limit_samples", type=int, default=None,
                        help="Limit the number of samples.")
    parser.add_argument("-nc", "--no_cache", action="store_true",
                        help="Whether to not store predictions in a cache database.")
    parser.add_argument("-dnp", "--decontamination_ngrams_path", type=str, default=None,
                        help="Path to the de-contamination n-grams file.")
    parser.add_argument("-ddp", "--description_dict_path", type=str, default=None,
                        help="Path to the description dictionary file.")
    parser.add_argument("-ci", "--check_integrity", action="store_true",
                        help="Whether to check integrity of tasks.")

    return parser.parse_args()
89+
90+
91+
if __name__ == "__main__":
    args = parse_args()

    # Limiting samples truncates every task, so metrics are not comparable
    # with full runs.
    if args.limit_samples:
        print("Warning: --limit_samples should only be used for testing.")

    # Expand glob-style task patterns against the full lm-eval task registry.
    if args.tasks is None:
        task_names = ALL_TASKS
    else:
        task_names = pattern_match(args.tasks.split(","), ALL_TASKS)
    print(f"Selected Tasks: {task_names}")

    # Optional per-task prompt descriptions, loaded from a JSON file.
    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    model = HFEvalModel(args.pre_trained_model_path, args.hub_tokenizer_path)

    outputs = evaluate_wrapper(
        model,
        task_names,
        num_fewshot=args.n_few_shot_samples,
        no_cache=args.no_cache,
        limit=args.limit_samples,
        description_dict=description_dict,
        check_integrity=args.check_integrity,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
    )

    serialized_outputs = json.dumps(outputs, indent=2)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(serialized_outputs)

    print(make_table(outputs))
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
import os
import random
from typing import Dict, List, Optional

import numpy as np

from harness.lm_eval_hf_model import HFEvalModel
from lm_eval.base import CachingLM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict
from lm_eval.utils import run_task_tests
13+
14+
15+
def evaluate_wrapper(
    model: HFEvalModel,
    tasks: List[str],
    num_fewshot: Optional[int] = 0,
    no_cache: Optional[bool] = False,
    limit: Optional[int] = None,
    bootstrap_iters: Optional[int] = 100000,
    description_dict: Optional[Dict[str, str]] = None,
    check_integrity: Optional[bool] = False,
    decontamination_ngrams_path: Optional[str] = None,
):
    """Evaluates a model on a set of lm-eval tasks with fixed seeds.

    Args:
        model: Model wrapper to be evaluated.
        tasks: Names of the tasks to evaluate on.
        num_fewshot: Number of few-shot samples per request.
        no_cache: Whether to skip caching predictions in a local database.
        limit: Maximum number of samples per task (testing only).
        bootstrap_iters: Iterations for the bootstrapped stderr estimates.
        description_dict: Maps task names to prompt descriptions
            (note: this is a dict, not a str — the original annotation was wrong).
        check_integrity: Whether to run the tasks' integrity tests first.
        decontamination_ngrams_path: Path to the de-contamination n-grams file.

    Returns:
        Dictionary of results, including a `config` entry with the settings used.

    """
    # Fixed seeds so few-shot sampling and bootstrap estimates are reproducible.
    random.seed(1234)
    np.random.seed(1234)

    if not no_cache:
        # CachingLM memoizes predictions in a SQLite file; the parent folder
        # must exist first, otherwise sqlite fails to create the database.
        os.makedirs("cache", exist_ok=True)
        model = CachingLM(model, "cache/hf-eval-model.db")

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=model,
        task_dict=get_task_dict(tasks),
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        description_dict=description_dict,
        decontamination_ngrams_path=decontamination_ngrams_path,
    )

    # Record the evaluation configuration alongside the metrics so saved
    # outputs are self-describing.
    results["config"] = {
        "num_fewshot": num_fewshot,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    return results
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from typing import List
5+
6+
import torch
7+
from lm_eval.base import BaseLM
8+
from transformers import AutoModelForCausalLM, AutoTokenizer
9+
10+
11+
class HFEvalModel(BaseLM):
    """lm-eval adapter around a Hugging Face causal language model."""

    def __init__(self, pre_trained_model_path: str, hub_tokenizer_path: str):
        """Loads the model/tokenizer pair and prepares them for evaluation.

        Args:
            pre_trained_model_path: Path to the pre-trained model file.
            hub_tokenizer_path: Name or path to the Hugging Face hub's tokenizer.

        """
        super().__init__()

        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = AutoModelForCausalLM.from_pretrained(pre_trained_model_path)

        self.tokenizer = AutoTokenizer.from_pretrained(hub_tokenizer_path)
        self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        # Fix: `[PAD]` is a brand-new token, so the embedding matrix must be
        # resized for its id to be a valid index.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Fix: lm-eval places input tensors on `self.device`, so the model
        # must live there too; `eval()` disables dropout during scoring.
        self.model.to(self._device)
        self.model.eval()

    @property
    def eot_token_id(self) -> int:
        # End-of-text token used by lm-eval as the default separator.
        return self.tokenizer.eos_token_id

    @property
    def max_length(self) -> int:
        # GPT-2-style configs expose `n_ctx`; most other architectures use
        # `max_position_embeddings` instead.
        try:
            return self.model.config.n_ctx
        except AttributeError:
            return self.model.config.max_position_embeddings

    @property
    def max_gen_toks(self) -> int:
        # Maximum number of tokens produced per generation request.
        return 256

    @property
    def batch_size(self) -> int:
        # Evaluation is performed one sample at a time.
        return 1

    @property
    def device(self) -> torch.device:
        return self._device

    def tok_encode(self, string: str) -> List[int]:
        # No special tokens: lm-eval concatenates contexts/continuations itself.
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps: torch.Tensor) -> torch.Tensor:
        # Returns raw logits; gradients are unnecessary during evaluation.
        with torch.no_grad():
            return self.model(inps)[0]

    def _model_generate(self, context: torch.Tensor, max_length: int, eos_token_id: int) -> torch.Tensor:
        # Fix: annotations were `str` but lm-eval passes/expects token tensors.
        # Greedy decoding (`do_sample=False`) keeps generations deterministic.
        return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False)
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright: https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py
2+
3+
import fnmatch
4+
from typing import List
5+
6+
7+
def pattern_match(patterns: List[str], source_list: List[str]) -> List[str]:
    """Expands glob-style patterns into the matching names from `source_list`.

    Args:
        patterns: Glob-style patterns, e.g., `["cb", "co*"]`.
        source_list: Available names to match against.

    Returns:
        De-duplicated list of matching names (order is unspecified).

    """
    matched = {name for pattern in patterns for name in fnmatch.filter(source_list, pattern)}

    return list(matched)
15+
16+
17+
class MultiChoice:
    """Container that validates comma-separated, glob-style choices.

    Intended to be passed as `choices=` to `argparse.add_argument`, which
    relies on `__contains__` for validation and `__iter__` for usage messages.
    """

    def __init__(self, choices: List[str]):
        # Full list of valid names.
        self.choices = choices

    def __contains__(self, values: str) -> bool:
        """Checks that every comma-separated pattern matches at least one choice.

        Note: `values` is the raw comma-separated argument string — the
        original `List[str]` annotation was incorrect (`split` is a `str`
        method).
        """
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                return False

        return True

    def __iter__(self):
        # Generator over the valid choices (the original `-> str` return
        # annotation was wrong for a generator; it yields `str` values).
        for choice in self.choices:
            yield choice

research/harness/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
2+
transformers

research/harness/setup.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from setuptools import find_packages, setup
5+
6+
# Fix: read the runtime dependencies with a context manager so the file
# handle is closed deterministically, and skip blank lines so they do not
# end up as empty requirement specifiers.
with open("requirements.txt", "r") as f:
    install_requires = [line.strip() for line in f if line.strip()]

setup(
    name="harness",
    version="0.1",
    author="Microsoft",
    url="https://github.com/microsoft/archai/research/harness",
    license="MIT",
    install_requires=install_requires,
    packages=find_packages(),
    include_package_data=True,
)

research/transformer_plus_plus/train_model.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,13 @@
77
import argparse
88
import os
99

10-
from archai.nlp.file_utils import check_available_checkpoint
11-
1210
from transformer_plus_plus.training.experiment import Experiment
1311

12+
from archai.common.file_utils import check_available_checkpoint
13+
1414

1515
def parse_args() -> argparse.Namespace:
16-
parser = argparse.ArgumentParser(
17-
description="Runs Transformer++ experiment."
18-
)
16+
parser = argparse.ArgumentParser(description="Runs Transformer++ experiment.")
1917

2018
parser.add_argument(
2119
"arch_config_file",
@@ -45,11 +43,7 @@ def parse_args() -> argparse.Namespace:
4543
if __name__ == "__main__":
4644
args = parse_args()
4745

48-
experiment = Experiment(
49-
args.arch_config_file,
50-
args.experiment_config_file,
51-
output_dir=args.output_dir
52-
)
46+
experiment = Experiment(args.arch_config_file, args.experiment_config_file, output_dir=args.output_dir)
5347

5448
# Asserts that resume_from_checkpoint will be a valid boolean
5549
# when pre-trained checkpoints exists

0 commit comments

Comments
 (0)