
Commit

Merge branch 'main' into del-convert-tokenizer-flag
apaniukov authored Mar 28, 2024
2 parents 9041fe5 + 382d00f commit 277225c
Showing 44 changed files with 2,162 additions and 1,808 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/test_inc.yml
@@ -30,8 +30,13 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install cmake
pip install py-cpuinfo
pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
pip install .[neural-compressor,diffusers,tests]
pip install intel-extension-for-pytorch
pip install intel-extension-for-pytorch==2.1.100
pip install intel-extension-for-transformers==1.3.2
pip install peft
- name: Test with Pytest
run: |
pytest tests/neural_compressor/
6 changes: 6 additions & 0 deletions .github/workflows/test_openvino.yml
@@ -36,3 +36,9 @@ jobs:
- name: Test with Pytest
run: |
pytest tests/openvino/ --ignore test_modeling_basic
- name: Test openvino-nightly
run: |
pip uninstall -y openvino
pip install openvino-nightly
python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
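The nightly step above exports `hf-internal-testing/tiny-random-gpt2` both through the Python API and through `optimum-cli export openvino`, which writes an OpenVINO model into the `gpt2-ov` directory. A minimal sketch (not part of this diff) of how that exported directory could then be loaded back for inference; the prompt and generation arguments are illustrative assumptions:

```python
# Sketch only: load the directory written by
# `optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov`
# and run a short generation with it.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = OVModelForCausalLM.from_pretrained("gpt2-ov")  # already exported, so no export=True

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5)
print(tokenizer.decode(outputs[0]))
```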
46 changes: 46 additions & 0 deletions .github/workflows/test_openvino_examples.yml
@@ -0,0 +1,46 @@
name: OpenVINO - Examples Test

on:
workflow_dispatch:
schedule:
- cron: 0 1 * * 1 # run weekly: every Monday at 1am
push:
paths:
- '.github/workflows/test_openvino_examples.yml'
- 'examples/openvino/*'
pull_request:
paths:
- '.github/workflows/test_openvino_examples.yml'
- 'examples/openvino/*'

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

jobs:
build:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.10"]

runs-on: ubuntu-20.04

steps:
- uses: actions/checkout@v2
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install optimum[openvino] jstyleson nncf pytest
pip install -r examples/openvino/audio-classification/requirements.txt
pip install -r examples/openvino/image-classification/requirements.txt
pip install -r examples/openvino/question-answering/requirements.txt
pip install -r examples/openvino/text-classification/requirements.txt
- name: Test examples
run: |
python -m pytest examples/openvino/test_examples.py
2 changes: 2 additions & 0 deletions .github/workflows/test_openvino_notebooks.yml
@@ -49,5 +49,7 @@ jobs:

- name: Test with Pytest
run: |
sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb
sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb
python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb notebooks/openvino/question_answering_quantization.ipynb
4 changes: 2 additions & 2 deletions Makefile
@@ -22,11 +22,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
# Run code quality checks
style_check:
black --check .
ruff .
ruff check .

style:
black .
ruff . --fix
ruff check . --fix

# Run tests for the library
test:
20 changes: 20 additions & 0 deletions README.md
@@ -202,6 +202,26 @@ Quantization aware training (QAT) is applied in order to simulate the effects of
You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index).


## IPEX
To load your IPEX model, simply replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export the model via TorchScript and apply IPEX optimizations: both operator-level optimizations (standard operators replaced with customized IPEX ones) and graph-level optimizations (such as operator fusion) will be applied to your model.
```diff
import torch
from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
+ from optimum.intel import IPEXModelForCausalLM


model_id = "gpt2"
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")

```

For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction).
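As a minimal follow-up sketch (not part of this diff), assuming the `IPEXModelForXxx` classes expose the usual `save_pretrained`/`from_pretrained` round trip, the exported model could be saved once and reloaded later without repeating the TorchScript export:

```python
# Sketch under the assumption that IPEXModelForCausalLM supports the usual
# save_pretrained / from_pretrained round trip: persist the exported model
# and reload it later without passing export=True again.
model.save_pretrained("./gpt2-ipex")       # directory name is illustrative
model = IPEXModelForCausalLM.from_pretrained("./gpt2-ipex")
```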


## Running the examples

Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference.
2 changes: 1 addition & 1 deletion examples/neural_compressor/language-modeling/README.md
@@ -97,4 +97,4 @@ respectively `dynamic`, `static`, `weight_only` or `aware_training`.

The flag `--verify_loading` can be passed along to verify that the resulting quantized model can be loaded correctly.

> **_Note:_** `weight_only` quantization_approach requires neural-compressor >= 2.3
> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` >= 2.3 and `intel-extension-for-transformers` >= 1.3.
2 changes: 2 additions & 0 deletions examples/neural_compressor/language-modeling/requirements.txt
@@ -3,3 +3,5 @@ torch >= 1.9
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
intel-extension-for-transformers >= 1.3
peft
95 changes: 66 additions & 29 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -57,6 +57,14 @@
from transformers.utils.versions import require_version

from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
from optimum.intel.utils.import_utils import (
INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR,
is_intel_extension_for_transformers_available,
)


if is_intel_extension_for_transformers_available():
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig


os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -143,7 +151,9 @@ class OptimizationArguments:
)
quantization_approach: str = field(
default="dynamic",
metadata={"help": "Quantization approach. Supported approach are static, dynamic and aware_training."},
metadata={
"help": "Quantization approach. Supported approach are static, dynamic aware_training and weight_only."
},
)
smooth_quant: bool = field(
default=False,
@@ -196,9 +206,13 @@ class OptimizationArguments:
default=False,
metadata={"help": "Whether or not to verify the loading of the quantized model."},
)
bits: int = field(
default=8,
metadata={"help": "Bits for weight only quantization, 1-8 bits."},
bits: str = field(
default="4",
metadata={"help": "Bits number of weight for weight only quantization. 1~8 bits."},
)
weight_dtype: str = field(
default="int4_clip",
metadata={"help": "weight dtype for weight only quantization."},
)
group_size: int = field(
default=-1,
@@ -214,10 +228,29 @@
)
quantization_methodology: str = field(
default="RTN",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
)
damp_percent: float = field(
default=0.01,
metadata={
"help": "Quantization methodology for weight only quantization. Choose from 'RTN', 'AWQ' and 'GPTQ'."
"help": "Percentage of Hessian's diagonal values average, which will be added to Hessian's diagonal to increase numerical stability, used for GPTQ quantization"
},
)
gptq_block_size: int = field(
default=128,
metadata={"help": "Block size. sub weight matrix size to run GPTQ."},
)
num_calibration_samples: int = field(
default=128, metadata={"help": "Number of examples to use for the GPTQ calibration step."}
)
use_max_length: bool = field(
default=False,
metadata={"help": "Set all sequence length to be same length of args.gptq_pad_max_length"},
)
pad_max_length: int = field(
default=2048,
metadata={"help": "Calibration dataset sequence max length, this should align with your model config"},
)


@dataclass
@@ -625,26 +658,30 @@ def compute_metrics(eval_preds):
else:
recipes = {}
if optim_args.quantization_approach == "weight_only":
op_type_dict = {
".*": {
"weight": {
"bits": optim_args.bits,
"group_size": optim_args.group_size,
"scheme": optim_args.weight_only_scheme,
"algorithm": optim_args.quantization_methodology,
},
},
}
if not is_intel_extension_for_transformers_available():
raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
if optim_args.apply_pruning or optim_args.apply_distillation:
raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
if optim_args.quantization_methodology == "GPTQ":
gptq_args = {
"pad_max_length": block_size,
algorithm_args = {
"act_order": False,
"percdamp": optim_args.damp_percent,
"block_size": optim_args.gptq_block_size,
"nsamples": optim_args.num_calibration_samples,
"use_max_length": optim_args.use_max_length,
"pad_max_length": optim_args.pad_max_length,
}
recipes.update({"gptq_args": gptq_args})
quantization_config = WeightOnlyQuantConfig(
weight_dtype=optim_args.weight_dtype,
group_size=optim_args.group_size,
scheme=optim_args.weight_only_scheme,
algorithm=optim_args.quantization_methodology,
algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
)
else:
op_type_dict = {}
quantization_config = PostTrainingQuantConfig(
approach=optim_args.quantization_approach, op_type_dict=op_type_dict, recipes=recipes
)
quantization_config = PostTrainingQuantConfig(
approach=optim_args.quantization_approach, recipes=recipes
)

if optim_args.apply_pruning:
if optim_args.end_step is None:
@@ -732,15 +769,15 @@ def compute_metrics(eval_preds):
quantizer.quantize(
quantization_config=quantization_config,
save_directory=training_args.output_dir,
calibration_dataset=train_dataset
if optim_args.quantization_approach in ["static", "weight_only"]
else None,
batch_size=1 # batch_size > 1 for GPTQ is WIP
if optim_args.quantization_approach == "weight_only" and optim_args.quantization_methodology == "GPTQ"
else training_args.per_device_train_batch_size,
weight_only=True if optim_args.quantization_approach == "weight_only" else False,
calibration_dataset=(
train_dataset if optim_args.quantization_approach in ["static", "weight_only"] else None
),
batch_size=(
1 if optim_args.quantization_approach == "weight_only" else training_args.per_device_train_batch_size
),
)
trainer.model = quantizer._quantized_model

if optim_args.apply_quantization and optim_args.verify_loading:
loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
tokens = tokenizer("This is a sample input", return_tensors="pt")
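Pulling the pieces of the new weight-only path together, here is a condensed, hedged sketch (not part of this diff) of how `run_clm.py` now wires `WeightOnlyQuantConfig` into `INCQuantizer`; the model and calibration data are simplified stand-ins, and the values mirror the defaults of the new `OptimizationArguments` fields shown above.

```python
# Hedged, condensed sketch of the weight-only flow added in run_clm.py.
# The WeightOnlyQuantConfig kwargs and the INCQuantizer.quantize() keywords
# mirror the diff above; model and calibration data are simplified stand-ins.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
from optimum.intel.neural_compressor import INCQuantizer

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# A small tokenized text dataset stands in for the training split, which
# run_clm.py passes as calibration data for the weight_only approach.
calibration_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:128]")
calibration_dataset = calibration_dataset.map(
    lambda examples: tokenizer(examples["text"], truncation=True, max_length=512),
    batched=True,
)

# Extra arguments forwarded only when --quantization_methodology is GPTQ.
algorithm_args = {
    "act_order": False,
    "percdamp": 0.01,         # --damp_percent
    "block_size": 128,        # --gptq_block_size
    "nsamples": 128,          # --num_calibration_samples
    "use_max_length": False,  # --use_max_length
    "pad_max_length": 2048,   # --pad_max_length
}

quantization_config = WeightOnlyQuantConfig(
    weight_dtype="int4_clip",       # --weight_dtype
    group_size=-1,                  # --group_size
    algorithm="GPTQ",               # --quantization_methodology ("RTN" or "GPTQ")
    algorithm_args=algorithm_args,  # None when the algorithm is "RTN"
    # the script also forwards --weight_only_scheme via the `scheme` kwarg
)

quantizer = INCQuantizer.from_pretrained(model)
quantizer.quantize(
    quantization_config=quantization_config,
    save_directory="clm_weight_only",
    calibration_dataset=calibration_dataset,
    batch_size=1,  # the script forces batch_size=1 for the weight_only approach
)
```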
4 changes: 1 addition & 3 deletions examples/neural_compressor/text-generation/run_generation.py
@@ -368,9 +368,7 @@ def calibration_fn(p_model):

args.length = adjust_length_to_model(
args.length,
max_sequence_length=model.config.max_position_embeddings
if hasattr(model.config, "max_position_embeddings")
else 0,
max_sequence_length=getattr(model.config, "max_position_embeddings", 0),
)
logger.info(args)

