from fastapi import FastAPI
import torch
import os
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests

device = "cpu"
access_token = os.getenv("access_token")
privateurl = os.getenv("privateurl")

# Request template used when relaying model output to the private endpoint.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': '',
    'Sec-Ch-Ua': '"Opera";v="95", "Chromium";v="109", "Not;A=Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0',
    'X-Requested-With': 'XMLHttpRequest'
}

payload = {
    'target': '',
    'content': '',
    'token': ''
}

# Tokenizers are only used to build chat-formatted prompts for the GGUF models.
tokenizer1 = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer2 = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
tokenizer3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Quantized GGUF models served through llama.cpp.
llm1 = Llama.from_pretrained(
    repo_id="Qwen/Qwen2-1.5B-Instruct-GGUF",
    filename="*q8_0.gguf",
    verbose=False
)

llm2 = Llama.from_pretrained(
    repo_id="NexaAIDev/gemma-2-2b-it-GGUF",
    filename="*q4_K_S.gguf",
    verbose=False
)

llm3 = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="*q4.gguf",
    verbose=False
)

app = FastAPI()


@app.get("/")
async def read_root():
    return {"Hello": "World!"}


def modelResp1(cookie, target, token, prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer1.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm1(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Whether to echo the prompt
    )
    response = output['choices'][0]['text']

    # Relay the generated reply to the private endpoint.
    headers['Cookie'] = f"{cookie}"
    payload['token'] = f"{token}"
    payload['target'] = f"{target}"
    payload['content'] = response

    requests.post(privateurl, headers=headers, data=payload)


def modelResp2(prompt):
    messages = [
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer2.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm2(
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Whether to echo the prompt
    )
    response = output['choices'][0]['text']

    return response


def modelResp3(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will respond politely and briefly."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
        {"role": "user", "content": f"{prompt}"}
    ]
    text = tokenizer3.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    output = llm3(  # use the Phi-3 model here (was llm2, a copy-paste bug)
        text,
        max_tokens=64,  # Generate up to 64 tokens
        echo=False,     # Whether to echo the prompt
    )
    response = output['choices'][0]['text']

    return response


@app.post("/modelapi1")
async def modelApi1(data: dict):
    target = data.get("target_id")
    cookie = data.get("Cookie")
    token = data.get("token")
    prompt = data.get("prompt")
    modelResp1(cookie, target, token, prompt)
    return {"Hello": "World!"}


@app.post("/modelapi2")
async def modelApi2(data: dict):
    prompt = data.get("prompt")
    # response = modelResp2(prompt)
    return {"Hello": "World!"}


@app.post("/modelapi3")
async def modelApi3(data: dict):
    prompt = data.get("prompt")
    response = modelResp3(prompt)
    return response
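

# A minimal sketch for local testing (an assumption, not part of the original
# deployment): start the API with uvicorn when this file is run directly.
# The filename "app.py", the port 7860, and the sample prompt below are
# illustrative only.
#
# Example client call once the server is up:
#   import requests
#   r = requests.post("http://localhost:7860/modelapi3", json={"prompt": "Hi Sia!"})
#   print(r.json())
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)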