set IPEX_LLM_LAST_LM_HEAD=1 as default (#11885)
parent 8c5c7f32dd · commit 0236de3ac2

6 changed files with 19 additions and 4 deletions
````diff
@@ -18,6 +18,7 @@ bash run.sh
 ```
+`run.sh`
 ```shell
 export IPEX_LLM_LAST_LM_HEAD=0
 python eval.py \
     --model_path "path to model" \
     --eval_type validation \
````
````diff
@@ -1,3 +1,5 @@
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python eval.py \
     --model_path "path to model" \
     --eval_type validation \
````
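Both hunks above add `export IPEX_LLM_LAST_LM_HEAD=0` to the shell entry points. Since the flag is read from the process environment when the model is converted (see the `_replace_with_low_bit_linear` hunk at the bottom), it can also be set from Python before loading the model. A minimal sketch, assuming the standard `ipex_llm.transformers` loading path; the model path is a placeholder taken from the example above:

```python
import os

# Opt out of the last-lm-head optimization for this process.
# This must happen before the model is converted, because the
# conversion code reads os.environ at that point (see the last hunk).
os.environ["IPEX_LLM_LAST_LM_HEAD"] = "0"

from ipex_llm.transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path to model",        # placeholder, as in the example above
    load_in_4bit=True,      # low-bit conversion is where the flag is consulted
    trust_remote_code=True,
)
```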
````diff
@@ -15,15 +15,21 @@ pip install -e .
 run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
 
 ### Evaluation on CPU
-```python
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 ### Evaluation on Intel GPU
-```python
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 ### Evaluation using multiple Intel GPU
-```python
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python run_multi_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu:0,2,3 --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks.
````
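The last context line describes `run_multi_llb.py` forking one process per listed xpu. The real script is not part of this diff; the following is only an illustrative sketch of that fan-out, with the worker body and argument handling as assumptions:

```python
import multiprocessing as mp

def run_tasks_on_device(device: str, tasks: list[str]) -> None:
    # Hypothetical worker: in the real harness each process would load
    # the model on its device and run the evaluation tasks there.
    print(f"[{device}] evaluating: {', '.join(tasks)}")

if __name__ == "__main__":
    # "--device xpu:0,2,3" -> one process per listed device index.
    device_arg = "xpu:0,2,3"
    prefix, _, indices = device_arg.partition(":")
    devices = [f"{prefix}:{i}" for i in indices.split(",")]

    tasks = ["hellaswag", "arc", "mmlu", "truthfulqa"]
    procs = [mp.Process(target=run_tasks_on_device, args=(d, tasks)) for d in devices]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```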
````diff
@@ -12,6 +12,11 @@ This is a required step on Linux for APT or offline installed oneAPI. Skip this
 source /opt/intel/oneapi/setvars.sh
 ```
 
+Please set IPEX_LLM_LAST_LM_HEAD=0 to disable the last_lm_head optimization.
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+```
+
 ## PPL Evaluation
 ### 1. Run on Wikitext
 An example to run perplexity on [wikitext](https://paperswithcode.com/dataset/wikitext-2):
````
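The added note tells users to export `IPEX_LLM_LAST_LM_HEAD=0` before running. To disable the optimization for a single run without touching the parent shell, the variable can also be scoped to one child process. A small sketch; `your_ppl_script.py` is a placeholder for the actual perplexity script from this README:

```python
import os
import subprocess
import sys

# Disable the last-lm-head optimization for one child process only;
# the current process's environment is left unchanged.
env = {**os.environ, "IPEX_LLM_LAST_LM_HEAD": "0"}

subprocess.run([sys.executable, "your_ppl_script.py"], env=env, check=True)
```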
````diff
@@ -10,6 +10,7 @@ pip install datasets evaluate soundfile librosa jiwer
 
 ## Run
 ```bash
+export IPEX_LLM_LAST_LM_HEAD=0
 python run_whisper.py --model_path /path/to/model --data_type other --device cpu
 ```
 
````
````diff
@@ -403,7 +403,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
     optimize_lm_head = (
         is_lm_head(name, model_config, out_features)
         and (
-            os.environ.get("IPEX_LLM_LAST_LM_HEAD", "0") == "1"
+            (not os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) == "0")
             or os.environ.get("IPEX_LLM_LOW_MEM", "0") == "1"
             and getattr(model_config, "model_type", "") in ["gptj", "llama", "qwen2"]
         )
````
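This `_replace_with_low_bit_linear` change is where the new default comes from: the old check enabled the optimization only when the variable was explicitly `"1"`, while the new check enables it unless the variable is explicitly `"0"`. A distilled, runnable comparison of just the two environment checks from this hunk, treating the surrounding `is_lm_head`/`model_type` conditions as satisfied:

```python
def enabled_old(env: dict) -> bool:
    # Pre-commit behavior: off unless explicitly "1".
    return (
        env.get("IPEX_LLM_LAST_LM_HEAD", "0") == "1"
        or env.get("IPEX_LLM_LOW_MEM", "0") == "1"
    )

def enabled_new(env: dict) -> bool:
    # Post-commit behavior: on unless explicitly "0".
    return (
        (not env.get("IPEX_LLM_LAST_LM_HEAD", None) == "0")
        or env.get("IPEX_LLM_LOW_MEM", "0") == "1"
    )

for env in ({}, {"IPEX_LLM_LAST_LM_HEAD": "0"}, {"IPEX_LLM_LAST_LM_HEAD": "1"}):
    print(env, "old:", enabled_old(env), "new:", enabled_new(env))
# {}                                old: False  new: True   <- the flipped default
# {'IPEX_LLM_LAST_LM_HEAD': '0'}    old: False  new: False
# {'IPEX_LLM_LAST_LM_HEAD': '1'}    old: True   new: True
```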