
Comments (2)

thisserand commented on July 30, 2024

@morpheuslord Yes, of course! Feel free to use the code in your project! :-) Happy to be a contributor :)

from llama2_local.

morpheuslord commented on July 30, 2024

Update: I got it to work

import os
import fire
from enum import Enum
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from auto_gptq import AutoGPTQForCausalLM
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from llama_chat_format import format_to_llama_chat_style  # local module from llama2_local
from flask import Flask, request, jsonify


class Model_Type(Enum):
    """Supported formats: GPTQ (via auto-gptq), GGML (via llama.cpp), or full precision (via transformers)."""
    gptq = 1
    ggml = 2
    full_precision = 3


def get_model_type(model_name):
    # Infer the model format from the model name.
    if "gptq" in model_name.lower():
        return Model_Type.gptq
    elif "ggml" in model_name.lower():
        return Model_Type.ggml
    else:
        return Model_Type.full_precision


def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)


def initialize_gpu_model_and_tokenizer(model_name, model_type):
    # GPTQ checkpoints load through auto-gptq; everything else through transformers.
    if model_type == Model_Type.gptq:
        model = AutoGPTQForCausalLM.from_quantized(
            model_name, device_map="auto", use_safetensors=True,
            use_triton=False)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="auto", token=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
    return model, tokenizer


def init_auto_model_and_tokenizer(model_name, model_type, file_name=None):
    if Model_Type.ggml == model_type:
        # GGML models run through llama.cpp: download the single model file
        # from the Hugging Face Hub and load it locally.
        models_folder = "./models"
        create_folder_if_not_exists(models_folder)
        file_path = hf_hub_download(
            repo_id=model_name, filename=file_name, local_dir=models_folder)
        model = Llama(file_path, n_ctx=4096)
        tokenizer = None
    else:
        model, tokenizer = initialize_gpu_model_and_tokenizer(
            model_name, model_type=model_type)
    return model, tokenizer


app = Flask(__name__)


@app.route('/api/chatbot', methods=['POST'])
def chatbot_api():
    data = request.json
    user_message = data['user_message']
    model_name = data['model_name']
    file_name = data.get('file_name')

    is_chat_model = 'chat' in model_name.lower()
    model_type = get_model_type(model_name)

    if model_type == Model_Type.ggml:
        assert file_name is not None, \
            "file_name must also be provided for GGML quantized models."

    # The model is loaded fresh on each request.
    model, tokenizer = init_auto_model_and_tokenizer(
        model_name, model_type, file_name)

    if is_chat_model:
        instruction = format_to_llama_chat_style([[user_message, None]])
    else:
        instruction = user_message

    history = [[user_message, None]]

    response = generate_response(
        model, tokenizer, instruction, history, model_type)
    return jsonify({'bot_response': response})


def generate_response(model, tokenizer, instruction, history, model_type):
    response = ""

    kwargs = dict(temperature=0.6, top_p=0.9)
    if model_type == Model_Type.ggml:
        # llama.cpp generates straight from the prompt string and streams text chunks.
        kwargs["max_tokens"] = 512
        for chunk in model(prompt=instruction, stream=True, **kwargs):
            token = chunk["choices"][0]["text"]
            response += token

    else:
        # transformers models stream tokens through TextIteratorStreamer
        # while generate() runs in a background thread.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, timeout=5)
        inputs = tokenizer(instruction, return_tensors="pt").to(model.device)
        kwargs["max_new_tokens"] = 512
        kwargs["input_ids"] = inputs["input_ids"]
        kwargs["streamer"] = streamer
        thread = Thread(target=model.generate, kwargs=kwargs)
        thread.start()

        for token in streamer:
            response += token

    return response


def run_app(port=5000):
    app.run(port=port)


if __name__ == '__main__':
    fire.Fire(run_app)
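
For reference, here's a rough sketch of how the endpoint can be exercised once the server is running. The model repo and file names below are just placeholder examples, not requirements of the code:

import requests

# Hypothetical example values for a GGML chat model; swap in whatever
# Hugging Face repo and file you actually use.
payload = {
    "model_name": "TheBloke/Llama-2-7B-Chat-GGML",
    "file_name": "llama-2-7b-chat.ggmlv3.q4_0.bin",  # required for GGML models
    "user_message": "What is the capital of France?",
}

# Assumes the server was started locally, e.g. `python app.py --port 5000`.
response = requests.post("http://127.0.0.1:5000/api/chatbot", json=payload)
print(response.json()["bot_response"])

GPTQ and full-precision models are called the same way; file_name can simply be omitted, since the assert only fires for GGML models.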

I wanted to ask if I can use this code in the project I mentioned above, and I'd like to add you as a contributor for it.

from llama2_local.
