
Comments (4)

zaobao commented on June 15, 2024

On both 'cpu' and 'cuda', the traced .pt model takes 4-5 times as long as the source model.


frankfliu commented on June 15, 2024

I'm not able to reproduce your case. I used the following code:

import os.path
import time

import torch
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def main():
    pairs = [['what is panda?', 'hi'],
             ['what is panda?',
              'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)

    model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
    model.eval()

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    model_file = "bge-reranker-v2-m3.pt"
    if os.path.exists(model_file):
        traced_model = torch.jit.load(model_file)
        traced_model.eval()
    else:
        traced_model = torch.jit.trace(model, (input_ids, attention_mask),
                                       strict=False)
        traced_model.save(model_file)

    # warmup
    with torch.no_grad():
        traced_model(input_ids, attention_mask)

    start_time = time.time()

    with torch.no_grad():
        for _ in range(10):
            traced_model(input_ids, attention_mask)

    execution_time = time.time() - start_time
    print("traced model: ", execution_time)

    # warmup
    with torch.no_grad():
        model(**inputs)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            model(**inputs)

    execution_time = time.time() - start_time
    print("huggingface model: ", execution_time)

    model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512)
    model.predict(pairs)

    start_time = time.time()
    for _ in range(10):
        model.predict(pairs)

    execution_time = time.time() - start_time
    print("CrossEncoder: ", execution_time)


if __name__ == '__main__':
    main()
Output:

traced model:  2.593564033508301
huggingface model:  2.4144911766052246
CrossEncoder:  2.5312209129333496


zaobao commented on June 15, 2024

@frankfliu I modified your script to run on CUDA, but got an error:

import time

import torch
from sentence_transformers import CrossEncoder
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def main():
    pairs = [['what is panda?', 'hi'],
             ['what is panda?',
              'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)

    model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3').to('cuda')
    model.eval()

    input_ids = inputs["input_ids"].to('cuda')
    attention_mask = inputs["attention_mask"].to('cuda')
    inputs = inputs.to('cuda')

    model_file = "bge-reranker-v2-m3.pt"

    traced_model = torch.jit.trace(model, (input_ids, attention_mask),
                                   strict=False)
    traced_model.save(model_file)

    # warmup
    with torch.no_grad():
        traced_model(input_ids, attention_mask)

    start_time = time.time()

    with torch.no_grad():
        for _ in range(10):
            traced_model(input_ids, attention_mask)

    execution_time = time.time() - start_time
    print("traced model: ", execution_time)

    # warmup
    with torch.no_grad():
        model(**inputs)

    start_time = time.time()
    with torch.no_grad():
        for _ in range(10):
            model(**inputs)

    execution_time = time.time() - start_time
    print("huggingface model: ", execution_time)

    # note: CrossEncoder is still created on CPU here
    model = CrossEncoder("BAAI/bge-reranker-v2-m3", max_length=512, device='cpu')
    model.predict(pairs)

    start_time = time.time()
    for _ in range(10):
        model.predict(pairs)

    execution_time = time.time() - start_time
    print("CrossEncoder: ", execution_time)


if __name__ == '__main__':
    main()
RuntimeError: default_program(24): error: extra text after expected end of number
          aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
                                                                                                           ^

default_program(28): error: extra text after expected end of number
      aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
                                                                                                                     ^

2 errors detected in the compilation of "default_program".

nvrtc compilation failed: 

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)


template<typename T>
__device__ T maximum(T a, T b) {
  return isnan(a) ? a : (a > b ? a : b);
}

template<typename T>
__device__ T minimum(T a, T b) {
  return isnan(a) ? a : (a < b ? a : b);
}

extern "C" __global__
void fused_mul_div_add(float* tattention_scores_1, float* tv_, float* aten_add, float* aten_mul) {
{
if (blockIdx.x<1ll ? 1 : 0) {
if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<86ll ? 1 : 0) {
if (blockIdx.x<1ll ? 1 : 0) {
        float v = __ldg(tv_ + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
        aten_mul[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v * -3.402823466385289e+38.f;
      }    }  }if ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)<59168ll ? 1 : 0) {
    float v_1 = __ldg(tattention_scores_1 + (long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x));
    float v_2 = __ldg(tv_ + ((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) % 43ll + 43ll * (((long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)) / 29584ll));
    aten_add[(long long)(threadIdx.x) + 512ll * (long long)(blockIdx.x)] = v_1 / 8.f + v_2 * -3.402823466385289e+38.f;
  }}
}

Environment

OS: Linux

nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12 8.9.2.26
torch 2.3.0


frankfliu commented on June 15, 2024

This is a known bug in TorchScript in PyTorch 2.x: on GPU, the TensorExpr fuser emits a float literal with an invalid suffix (the -3.402823466385289e+38.f in the generated kernel above, where ".f" after an exponent is not valid C++), which nvrtc refuses to compile. Please try PyTorch 1.13.1 on GPU.
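If you need to stay on PyTorch 2.x, one possible workaround (my assumption, not an official fix; these are internal, unsupported PyTorch APIs) is to disable the TensorExpr fuser so the broken kernel is never generated, at the cost of running unfused and therefore slower kernels:

import torch

# Assumption: disabling the NNC/TensorExpr fuser avoids the invalid
# float literal in the generated CUDA kernel. These are internal APIs
# and may change between PyTorch releases.
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_override_can_fuse_on_gpu(False)

traced_model = torch.jit.load("bge-reranker-v2-m3.pt", map_location='cuda')
traced_model.eval()
# run inference as in the script above; expect it to be slower without fusion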

Your best option for running text embedding / reranking models is to convert them to ONNX; see: https://github.com/deepjavalibrary/djl/blob/master/extensions/tokenizers/src/main/python/djl_converter/huggingface_converter.py#L63
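If you want to try the conversion by hand rather than through the DJL converter, a minimal sketch with torch.onnx.export looks like the following (the output file name, the input/output names, and the dynamic axes are my own choices for illustration, not something the converter prescribes):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
model.eval()

pairs = [['what is panda?', 'hi']]
inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)

# Mark batch and sequence dimensions as dynamic so the exported model
# accepts inputs of any shape at inference time.
torch.onnx.export(
    model,
    (inputs['input_ids'], inputs['attention_mask']),
    'bge-reranker-v2-m3.onnx',
    input_names=['input_ids', 'attention_mask'],
    output_names=['logits'],
    dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
                  'attention_mask': {0: 'batch', 1: 'seq'},
                  'logits': {0: 'batch'}},
    opset_version=14,
)

The resulting file can then be run with onnxruntime, or loaded through DJL's OnnxRuntime engine.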

