Try it live on Gradio:
How does it work? As easy as 1, 2, 3.
1. Load our 4-bit model from Hugging Face, using the standard RoLlama2 tokenizer.
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="intelpen/OpenLLM-Ro-RoLLama2-7b-Instruct-v1-4Bit-BB",
    tokenizer="OpenLLM-Ro/RoLlama2-7b-Instruct-v1"
)
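If you want to sanity-check the pipeline before wiring it into a chatbot, a quick generation call like the one below should return text (the Romanian prompt and token limit are only illustrative):

print(pipe("Care este capitala României?", max_new_tokens=32)[0]["generated_text"])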
2. Use your pipeline to instruct the model to act as a chatbot.
from time import time

def query_model(
    system_message,
    user_message,
    temperature=0.7,
    max_length=1024
):
    start_time = time()
    user_message = "Question: " + user_message + " Answer:"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    # Build the prompt with the tokenizer's chat template
    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Token ids at which generation should stop
    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    sequences = pipe(
        prompt,
        do_sample=True,
        top_p=0.9,
        temperature=temperature,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=pipe.model.config.eos_token_id
    )
    answer = sequences[0]['generated_text']
    end_time = time()
    ttime = f"Total time: {round(end_time - start_time, 2)} sec."
    return answer
3. Finally, create your chatbot with Gradio.
import gradio as gr
default_system = """
You are an AI assistant designed to answer simple questions.
Please restrict your answer to the exact question asked
"""
def chat_with_model(message, history):
    # history is supplied by gr.ChatInterface; fold it into the system prompt
    if len(history) == 0:
        system_message = default_system
    else:
        system_message = default_system + f" and to the history provided: {''.join(str(turn) for turn in history)}"
    return query_model(system_message, message)
gr.ChatInterface(chat_with_model).launch(share=True)
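Joining the raw history list into the system prompt works, but the resulting string is hard for the model to read. As a sketch (assuming the default gr.ChatInterface behaviour of passing history as a list of [user, assistant] pairs; the helper names are hypothetical), each turn can be formatted explicitly before it is appended:

def format_history(history):
    # history comes from gr.ChatInterface as [user_message, assistant_message] pairs
    return "\n".join(f"User: {u}\nAssistant: {a}" for u, a in history)

def chat_with_history(message, history):
    system_message = default_system
    if history:
        system_message += "\nConversation so far:\n" + format_history(history)
    return query_model(system_message, message)

Passing chat_with_history to gr.ChatInterface instead of chat_with_model gives the model a cleaner view of the previous turns.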