"""Gradio demo for the Phi-3-Vision model (model inference currently stubbed out)."""

import gradio as gr
# from transformers import AutoModelForCausalLM, AutoProcessor

# Model identifier; actual loading is commented out below (needs a CUDA GPU
# and flash-attention) — kept so it can be re-enabled easily.
model_id = "microsoft/Phi-3-vision-128k-instruct"
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, device_map="cuda", trust_remote_code=True,
#     torch_dtype="auto", _attn_implementation="flash_attention_2")
# processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)


def generate_text(image, prompt):
    """Return generated text for *image* and *prompt*.

    The real Phi-3-Vision call is commented out above, so this is a stub.

    BUG FIX: the original body returned ``image + prompt``. With
    ``gr.Image(type="pil")`` the callback receives a ``PIL.Image.Image``,
    and PIL image + str raises ``TypeError`` on every submission. Return a
    plain string instead so the Textbox output always works.

    Args:
        image: PIL image from the Gradio Image input (may be None if the
            user submits without uploading).
        prompt: text from the Gradio Textbox input.

    Returns:
        A string echoing the prompt (placeholder until inference is wired up).
    """
    # Kept for when the model call above is re-enabled.
    generation_args = {
        "max_new_tokens": 500,
        "temperature": 0.0,
        "do_sample": False,
    }
    if image is None:
        return prompt
    # PIL images expose .size; fall back to the type name for other inputs.
    size = getattr(image, "size", type(image).__name__)
    return f"[image: {size}] {prompt}"


# Build the UI once; launch only when executed as a script so importing this
# module does not start a (public, share=True) server as a side effect.
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Prompt"),
    ],
    outputs=gr.Textbox(),
    title="Phi-3-Vision Model",
    description="Generate text based on an image and prompt using the Phi-3-Vision model.",
)

if __name__ == "__main__":
    demo.launch(share=True, show_error=True)