CPU Benchmark (ONNX-cpp)

November 7, 2023 · View on GitHub

Configuration

Data set:

Aishell1 test set; the total audio duration is 36108.919 seconds.

Tools

Install modelscope and funasr

pip3 install torch torchaudio
pip install -U modelscope
pip install -U funasr

Export onnx model

python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True

Building for Linux/Unix

Download onnxruntime

# download an appropriate onnxruntime from https://github.com/microsoft/onnxruntime/releases/tag/v1.14.0
# here we get a copy of onnxruntime for linux 64
wget https://github.com/microsoft/onnxruntime/releases/download/v1.14.0/onnxruntime-linux-x64-1.14.0.tgz
tar -zxvf onnxruntime-linux-x64-1.14.0.tgz

Install openblas

sudo apt-get install libopenblas-dev #ubuntu
# sudo yum -y install openblas-devel #centos

Build runtime

git clone https://github.com/alibaba-damo-academy/FunASR.git && cd funasr/runtime/onnxruntime
mkdir build && cd build
cmake  -DCMAKE_BUILD_TYPE=release .. -DONNXRUNTIME_DIR=/path/to/onnxruntime-linux-x64-1.14.0
make

Paraformer-large

./funasr-onnx-offline-rtf \
    --model-dir    ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
    --quantize  true \
    --wav-path     ./aishell1_test.scp  \
    --thread-num 32

Note: '--quantize false' means fp32, otherwise it will be int8.

Number of Parameters: 220M

Storage size: 880MB

Storage size after int8-quant: 237MB

CER: 1.95%

CER after int8-quant: 1.95%

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni

| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 2129s | 0.058974 | 17 |
| 1 (onnx int8) | 1020s | 0.028263 | 35 |
| 8 (onnx fp32) | 273s | 0.007553 | 132 |
| 8 (onnx int8) | 128s | 0.003558 | 281 |
| 16 (onnx fp32) | 146s | 0.00403 | 248 |
| 16 (onnx int8) | 67s | 0.001868 | 535 |
| 32 (onnx fp32) | 133s | 0.003672 | 272 |
| 32 (onnx int8) | 64s | 0.001778 | 562 |
| 64 (onnx fp32) | 136s | 0.003771 | 265 |
| 64 (onnx int8) | 67s | 0.001846 | 541 |
| 96 (onnx fp32) | 137s | 0.003788 | 264 |
| 96 (onnx int8) | 68s | 0.001875 | 533 |

Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz 32core-64processor without avx512_vnni

| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 2903s | 0.080404 | 12 |
| 1 (onnx int8) | 2714s | 0.075168 | 13 |
| 8 (onnx fp32) | 373s | 0.010329 | 97 |
| 8 (onnx int8) | 340s | 0.009428 | 106 |
| 16 (onnx fp32) | 189s | 0.005252 | 190 |
| 16 (onnx int8) | 174s | 0.004817 | 207 |
| 32 (onnx fp32) | 109s | 0.00301 | 332 |
| 32 (onnx int8) | 88s | 0.00245 | 408 |
| 64 (onnx fp32) | 113s | 0.003129 | 320 |
| 64 (onnx int8) | 79s | 0.002201 | 454 |
| 96 (onnx fp32) | 115s | 0.003183 | 314 |
| 96 (onnx int8) | 80s | 0.002222 | 450 |

FSMN-VAD + Paraformer-large + CT-Transformer

./funasr-onnx-offline-rtf \
    --model-dir    ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
    --quantize  true \
    --vad-dir   ./damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
    --punc-dir  ./damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx \
    --wav-path     ./aishell1_test.scp  \
    --thread-num 32

Note: '--quantize false' means fp32, otherwise it will be int8.

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni

| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 2134s | 0.0591 | 17 |
| 1 (onnx int8) | 1047s | 0.029 | 34 |
| 8 (onnx fp32) | 273s | 0.007557 | 132 |
| 8 (onnx int8) | 132s | 0.003647 | 274 |
| 16 (onnx fp32) | 147s | 0.004061 | 246 |
| 16 (onnx int8) | 69s | 0.001916 | 521 |
| 32 (onnx fp32) | 133s | 0.003675 | 272 |
| 32 (onnx int8) | 65s | 0.001786 | 559 |
| 64 (onnx fp32) | 136s | 0.003767 | 265 |
| 64 (onnx int8) | 67s | 0.001867 | 535 |
| 96 (onnx fp32) | 137s | 0.003802 | 262 |
| 96 (onnx int8) | 69s | 0.001904 | 524 |

Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz 32core-64processor without avx512_vnni

| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 3073s | 0.0851 | 12 |
| 1 (onnx int8) | 2840s | 0.0787 | 13 |
| 8 (onnx fp32) | 389s | 0.01079 | 93 |
| 8 (onnx int8) | 355s | 0.0098 | 101 |
| 16 (onnx fp32) | 199s | 0.005513 | 181 |
| 16 (onnx int8) | 171s | 0.004784 | 210 |
| 32 (onnx fp32) | 113s | 0.00314 | 318 |
| 32 (onnx int8) | 92s | 0.00255 | 391 |
| 64 (onnx fp32) | 115s | 0.0032 | 312 |
| 64 (onnx int8) | 81s | 0.002232 | 448 |
| 96 (onnx fp32) | 117s | 0.003257 | 307 |
| 96 (onnx int8) | 81s | 0.002258 | 442 |

FSMN-VAD + Paraformer-large + Ngram + CT-Transformer

./funasr-onnx-offline-rtf \
    --model-dir    ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
    --quantize  true \
    --vad-dir   ./damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
    --punc-dir  ./damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx \
    --lm-dir    ./damo/speech_ngram_lm_zh-cn-ai-wesp-fst \
    --wav-path     ./aishell1_test.scp  \
    --thread-num 32

Note: '--quantize false' means fp32, otherwise it will be int8.

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni

| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 2506s | 0.0694 | 14 |
| 1 (onnx int8) | 1448s | 0.0401 | 25 |
| 8 (onnx fp32) | 326s | 0.0090 | 110 |
| 8 (onnx int8) | 184s | 0.0051 | 195 |
| 16 (onnx fp32) | 178s | 0.0049 | 202 |
| 16 (onnx int8) | 99s | 0.0027 | 361 |
| 32 (onnx fp32) | 152s | 0.0042 | 236 |
| 32 (onnx int8) | 85s | 0.0023 | 422 |
| 64 (onnx fp32) | 157s | 0.0043 | 228 |
| 64 (onnx int8) | 89s | 0.0024 | 403 |
| 96 (onnx fp32) | 158s | 0.0044 | 227 |
| 96 (onnx int8) | 91s | 0.0025 | 396 |

# using hotwords
./funasr-onnx-offline-rtf \
    --model-dir    ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
    --quantize  true \
    --vad-dir   ./damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
    --punc-dir  ./damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx \
    --lm-dir    ./damo/speech_ngram_lm_zh-cn-ai-wesp-fst \
    --hotword  hotwords_dev_600.txt \
    --wav-path     ./aishell1_test.scp  \
    --thread-num 32

Note: '--quantize false' means fp32, otherwise it will be int8.

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni

| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 3172s | 0.0878 | 11 |
| 1 (onnx int8) | 2140s | 0.0592 | 16 |
| 8 (onnx fp32) | 412s | 0.0114 | 87 |
| 8 (onnx int8) | 268s | 0.0074 | 134 |
| 16 (onnx fp32) | 218s | 0.0060 | 165 |
| 16 (onnx int8) | 140s | 0.0038 | 257 |
| 32 (onnx fp32) | 183s | 0.0050 | 196 |
| 32 (onnx int8) | 116s | 0.0032 | 310 |
| 64 (onnx fp32) | 188s | 0.0052 | 192 |
| 64 (onnx int8) | 120s | 0.0033 | 299 |
| 96 (onnx fp32) | 191s | 0.0052 | 188 |
| 96 (onnx int8) | 122s | 0.0033 | 294 |

FSMN-VAD + Paraformer-en + CT-Transformer

./funasr-onnx-offline-rtf \
    --model-dir    ./damo/speech_paraformer-large_asr_nat-en-16k-common-vocab10020-onnx \
    --quantize  true \
    --vad-dir   ./damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
    --punc-dir  ./damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx \
    --wav-path     ./librispeech_test_clean.scp  \
    --thread-num 32

Note: '--quantize false' means fp32, otherwise it will be int8.

Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni

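| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
|---|---|---|---|
| 1 (onnx fp32) | 1327s | 0.0682 | 15 |
| 1 (onnx int8) | 734s | 0.0377 | 26 |
| 8 (onnx fp32) | 169s | 0.0087 | 114 |
| 8 (onnx int8) | 94s | 0.0048 | 205 |
| 16 (onnx fp32) | 89s | 0.0046 | 217 |
| 16 (onnx int8) | 50s | 0.0025 | 388 |
| 32 (onnx fp32) | 78s | 0.0040 | 248 |
| 32 (onnx int8) | 43s | 0.0022 | 448 |
| 64 (onnx fp32) | 79s | 0.0041 | 243 |
| 64 (onnx int8) | 44s | 0.0022 | 438 |
| 96 (onnx fp32) | 80s | 0.0041 | 240 |
| 96 (onnx int8) | 45s | 0.0023 | 428 |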