Skip to content

Commit 53be8c1

Browse files
committed
chore(research): Adds project for evaluating models with lm-eval.
1 parent 0728620 commit 53be8c1

10 files changed

Lines changed: 324 additions & 10 deletions

File tree

research/harness/.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
# Cache files
5+
cache/
6+
**/*.db

research/harness/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Harness
2+
3+
## Installation
4+
5+
To install Harness, run the following commands in your command line:
6+
7+
```shell
8+
conda create -n harness python=3.8
9+
conda activate harness
10+
11+
pip install -e .
12+
```
13+
14+
## Evaluating with LM-Evaluation-Harness (lm-eval)
15+
16+
To evaluate your model with LM-Evaluation-Harness, run the following command:
17+
18+
```shell
19+
python evaluate_with_lm_eval.py --help
20+
```
21+
22+
This will give you a list of options and arguments that can be passed to the script to evaluate your model. For example:
23+
24+
```shell
25+
python evaluate_with_lm_eval.py gpt2 gpt2 --tasks cb,copa
26+
```
27+
28+
This will evaluate a pre-trained GPT-2 from Hugging Face's Hub, using the `gpt2` pre-trained tokenizer on two SuperGLUE tasks: CommitmentBank and Choice of Plausible Alternatives.
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
import argparse
5+
import json
6+
7+
from harness.lm_eval_evaluator import evaluate_wrapper
8+
from harness.lm_eval_hf_model import HFEvalModel
9+
from harness.lm_eval_utils import MultiChoice, pattern_match
10+
from lm_eval.evaluator import make_table
11+
from lm_eval.tasks import ALL_TASKS
12+
13+
14+
def parse_args():
    """Builds and parses the command-line arguments accepted by this script.

    Returns:
        `argparse.Namespace` with the evaluation settings.

    """
    parser = argparse.ArgumentParser(description="Evaluates pre-trained models using lm-eval package.")

    # Positional arguments: where the model weights live and which tokenizer to load.
    parser.add_argument("pre_trained_model_path", type=str, help="Path to the pre-trained model file.")
    parser.add_argument("hub_tokenizer_path", type=str, help="Name or path to the Hugging Face hub's tokenizer.")

    # `MultiChoice` validates comma-separated, glob-style task patterns.
    parser.add_argument("-t", "--tasks", choices=MultiChoice(ALL_TASKS), type=str, default=None,
                        help="Tasks to be evaluated (separated by comma), e.g., `wsc,cb,copa`.")
    parser.add_argument("-o", "--output_path", type=str, default=None,
                        help="Path to the saved outputs.")
    parser.add_argument("-ns", "--n_few_shot_samples", type=int, default=0,
                        help="Number of few-shot samples.")
    parser.add_argument("-ls", "--limit_samples", type=int, default=None,
                        help="Limit the number of samples.")
    parser.add_argument("-nc", "--no_cache", action="store_true",
                        help="Whether to not store predictions in a cache database.")
    parser.add_argument("-dnp", "--decontamination_ngrams_path", type=str, default=None,
                        help="Path to the de-contamination n-grams file.")
    parser.add_argument("-ddp", "--description_dict_path", type=str, default=None,
                        help="Path to the description dictionary file.")
    parser.add_argument("-ci", "--check_integrity", action="store_true",
                        help="Whether to check integrity of tasks.")

    return parser.parse_args()
89+
90+
91+
if __name__ == "__main__":
    args = parse_args()

    # Limiting samples truncates every task, so metrics are not comparable
    # with full runs.
    if args.limit_samples:
        print("Warning: --limit_samples should only be used for testing.")

    # Expand glob-style task patterns against the full lm-eval task registry.
    if args.tasks is None:
        task_names = ALL_TASKS
    else:
        task_names = pattern_match(args.tasks.split(","), ALL_TASKS)
    print(f"Selected Tasks: {task_names}")

    # Optional per-task prompt descriptions, loaded from a JSON file.
    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    model = HFEvalModel(args.pre_trained_model_path, args.hub_tokenizer_path)

    outputs = evaluate_wrapper(
        model,
        task_names,
        num_fewshot=args.n_few_shot_samples,
        no_cache=args.no_cache,
        limit=args.limit_samples,
        description_dict=description_dict,
        check_integrity=args.check_integrity,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
    )

    serialized_outputs = json.dumps(outputs, indent=2)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(serialized_outputs)

    print(make_table(outputs))
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
import os
import random
from typing import Dict, List, Optional

import numpy as np

from harness.lm_eval_hf_model import HFEvalModel
from lm_eval.base import CachingLM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict
from lm_eval.utils import run_task_tests
13+
14+
15+
def evaluate_wrapper(
    model: HFEvalModel,
    tasks: List[str],
    num_fewshot: Optional[int] = 0,
    no_cache: Optional[bool] = False,
    limit: Optional[int] = None,
    bootstrap_iters: Optional[int] = 100000,
    description_dict: Optional[Dict[str, str]] = None,
    check_integrity: Optional[bool] = False,
    decontamination_ngrams_path: Optional[str] = None,
):
    """Evaluates a model on a set of lm-eval tasks with fixed seeds.

    Args:
        model: Model wrapper to be evaluated.
        tasks: Names of the tasks to evaluate on.
        num_fewshot: Number of few-shot samples per request.
        no_cache: Whether to skip caching predictions in a local database.
        limit: Maximum number of samples per task (testing only).
        bootstrap_iters: Iterations for the bootstrapped stderr estimates.
        description_dict: Maps task names to prompt descriptions
            (note: this is a dict, not a str — the original annotation was wrong).
        check_integrity: Whether to run the tasks' integrity tests first.
        decontamination_ngrams_path: Path to the de-contamination n-grams file.

    Returns:
        Dictionary of results, including a `config` entry with the settings used.

    """
    # Fixed seeds so few-shot sampling and bootstrap estimates are reproducible.
    random.seed(1234)
    np.random.seed(1234)

    if not no_cache:
        # CachingLM memoizes predictions in a SQLite file; the parent folder
        # must exist first, otherwise sqlite fails to create the database.
        os.makedirs("cache", exist_ok=True)
        model = CachingLM(model, "cache/hf-eval-model.db")

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=model,
        task_dict=get_task_dict(tasks),
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        description_dict=description_dict,
        decontamination_ngrams_path=decontamination_ngrams_path,
    )

    # Record the evaluation configuration alongside the metrics so saved
    # outputs are self-describing.
    results["config"] = {
        "num_fewshot": num_fewshot,
        "no_cache": no_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    return results
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from typing import List
5+
6+
import torch
7+
from lm_eval.base import BaseLM
8+
from transformers import AutoModelForCausalLM, AutoTokenizer
9+
10+
11+
class HFEvalModel(BaseLM):
    """lm-eval adapter around a Hugging Face causal language model."""

    def __init__(self, pre_trained_model_path: str, hub_tokenizer_path: str):
        """Loads the model/tokenizer pair and prepares them for evaluation.

        Args:
            pre_trained_model_path: Path to the pre-trained model file.
            hub_tokenizer_path: Name or path to the Hugging Face hub's tokenizer.

        """
        super().__init__()

        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = AutoModelForCausalLM.from_pretrained(pre_trained_model_path)

        self.tokenizer = AutoTokenizer.from_pretrained(hub_tokenizer_path)
        self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        # Fix: `[PAD]` is a brand-new token, so the embedding matrix must be
        # resized for its id to be a valid index.
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Fix: lm-eval places input tensors on `self.device`, so the model
        # must live there too; `eval()` disables dropout during scoring.
        self.model.to(self._device)
        self.model.eval()

    @property
    def eot_token_id(self) -> int:
        # End-of-text token used by lm-eval as the default separator.
        return self.tokenizer.eos_token_id

    @property
    def max_length(self) -> int:
        # GPT-2-style configs expose `n_ctx`; most other architectures use
        # `max_position_embeddings` instead.
        try:
            return self.model.config.n_ctx
        except AttributeError:
            return self.model.config.max_position_embeddings

    @property
    def max_gen_toks(self) -> int:
        # Maximum number of tokens produced per generation request.
        return 256

    @property
    def batch_size(self) -> int:
        # Evaluation is performed one sample at a time.
        return 1

    @property
    def device(self) -> torch.device:
        return self._device

    def tok_encode(self, string: str) -> List[int]:
        # No special tokens: lm-eval concatenates contexts/continuations itself.
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)

    def _model_call(self, inps: torch.Tensor) -> torch.Tensor:
        # Returns raw logits; gradients are unnecessary during evaluation.
        with torch.no_grad():
            return self.model(inps)[0]

    def _model_generate(self, context: torch.Tensor, max_length: int, eos_token_id: int) -> torch.Tensor:
        # Fix: annotations were `str` but lm-eval passes/expects token tensors.
        # Greedy decoding (`do_sample=False`) keeps generations deterministic.
        return self.model.generate(context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False)
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright: https://github.com/EleutherAI/lm-evaluation-harness/blob/master/main.py
2+
3+
import fnmatch
4+
from typing import List
5+
6+
7+
def pattern_match(patterns: List[str], source_list: List[str]) -> List[str]:
    """Expands glob-style patterns into the matching names from `source_list`.

    Args:
        patterns: Glob-style patterns, e.g., `["cb", "co*"]`.
        source_list: Available names to match against.

    Returns:
        De-duplicated list of matching names (order is unspecified).

    """
    matched = {name for pattern in patterns for name in fnmatch.filter(source_list, pattern)}

    return list(matched)
15+
16+
17+
class MultiChoice:
    """Container that validates comma-separated, glob-style choices.

    Intended to be passed as `choices=` to `argparse.add_argument`, which
    relies on `__contains__` for validation and `__iter__` for usage messages.
    """

    def __init__(self, choices: List[str]):
        # Full list of valid names.
        self.choices = choices

    def __contains__(self, values: str) -> bool:
        """Checks that every comma-separated pattern matches at least one choice.

        Note: `values` is the raw comma-separated argument string — the
        original `List[str]` annotation was incorrect (`split` is a `str`
        method).
        """
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                return False

        return True

    def __iter__(self):
        # Generator over the valid choices (the original `-> str` return
        # annotation was wrong for a generator; it yields `str` values).
        for choice in self.choices:
            yield choice

research/harness/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git
2+
transformers

research/harness/setup.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from setuptools import find_packages, setup
5+
6+
# Fix: read the runtime dependencies with a context manager so the file
# handle is closed deterministically, and skip blank lines so they do not
# end up as empty requirement specifiers.
with open("requirements.txt", "r") as f:
    install_requires = [line.strip() for line in f if line.strip()]

setup(
    name="harness",
    version="0.1",
    author="Microsoft",
    url="https://github.com/microsoft/archai/research/harness",
    license="MIT",
    install_requires=install_requires,
    packages=find_packages(),
    include_package_data=True,
)

research/transformer_plus_plus/train_model.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,13 @@
77
import argparse
88
import os
99

10-
from archai.nlp.file_utils import check_available_checkpoint
11-
1210
from transformer_plus_plus.training.experiment import Experiment
1311

12+
from archai.common.file_utils import check_available_checkpoint
13+
1414

1515
def parse_args() -> argparse.Namespace:
16-
parser = argparse.ArgumentParser(
17-
description="Runs Transformer++ experiment."
18-
)
16+
parser = argparse.ArgumentParser(description="Runs Transformer++ experiment.")
1917

2018
parser.add_argument(
2119
"arch_config_file",
@@ -45,11 +43,7 @@ def parse_args() -> argparse.Namespace:
4543
if __name__ == "__main__":
4644
args = parse_args()
4745

48-
experiment = Experiment(
49-
args.arch_config_file,
50-
args.experiment_config_file,
51-
output_dir=args.output_dir
52-
)
46+
experiment = Experiment(args.arch_config_file, args.experiment_config_file, output_dir=args.output_dir)
5347

5448
# Asserts that resume_from_checkpoint will be a valid boolean
5549
# when pre-trained checkpoints exists

0 commit comments

Comments
 (0)