Find answers from the community

Updated 3 months ago

@Logan M I am trying to use llava instead

I am trying to use llava instead of GPT-4V,
Plain Text
# Endpoint URL intended for the chat-completions style request built below.
# NOTE(review): Replicate's public HTTP API is documented under
# https://api.replicate.com/v1/ (predictions-based), not an OpenAI-style
# /chat/completions route on api.replicate.ai -- confirm this URL is valid.
REPLICATE_API_URL = "https://api.replicate.ai/v1/chat/completions"


def compose_payload(images: np.ndarray, prompt: str) -> dict:
    """Build the request body for a single-turn, multimodal user message.

    The message content lists the text prompt first, followed by one image
    part per item obtained by iterating ``images``.

    Args:
        images: Iterable of images; each item is handed to
            ``encode_image_to_base64`` and the result is embedded as-is.
        prompt: Text instruction placed ahead of the image parts.

    Returns:
        A dict with ``model``, ``messages`` and ``max_tokens`` keys, where
        ``max_tokens`` is fixed at 300.
    """
    # The text part always leads; image parts are appended in input order.
    parts = [{"type": "text", "text": prompt}]
    for img in images:
        parts.append(
            {"type": "image", "image": encode_image_to_base64(image=img)}
        )

    user_message = {"role": "user", "content": parts}
    return {
        "model": REPLICATE_MULTI_MODAL_LLM_MODELS["llava-13b"],
        "messages": [user_message],
        "max_tokens": 300,
    }

def prompt_image(api_key: str, images: list, prompt: str, timeout=None) -> list:
    """Send ``prompt`` together with each image and collect the replies.

    One HTTP POST is issued per image, so an error on any image aborts the
    whole call and no partial results are returned.

    Args:
        api_key: Credential forwarded to ``compose_headers``.
        images: Images to describe. Each item is passed to
            ``compose_payload`` and must expose an ``image_path`` attribute.
        prompt: Text instruction sent alongside every image.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        A list with one dict per image, holding the model reply under
        ``"response"`` and the stringified image path under ``"image"``.

    Raises:
        ValueError: If the decoded response body contains an ``"error"``
            entry; the message is taken from ``error["message"]`` when
            available, otherwise from the error value itself.
    """
    # Insert your Replicate API URL here.
    api_url = "https://api.replicate.ai/v1/chat/completions"
    headers = compose_headers(api_key=api_key)
    results = []
    for image in images:
        payload = compose_payload(images=[image], prompt=prompt)
        response = requests.post(
            url=api_url, headers=headers, json=payload, timeout=timeout
        ).json()

        if 'error' in response:
            error = response['error']
            # The error entry may be a plain string or a dict without a
            # "message" key; surface whatever is there instead of failing
            # with an unrelated TypeError/KeyError.
            if isinstance(error, dict):
                message = error.get('message', error)
            else:
                message = error
            raise ValueError(str(message))

        # NOTE(review): assumes an OpenAI-style body with a "choices" list;
        # adjust to the actual response structure of the endpoint.
        results.append({
            "response": response['choices'][0]['message']['content'],
            "image": str(image.image_path),  # adjust to your image structure
        })

    return results
L
a
2 comments
Yes, I have tried it, but I would love to do this for a video by splitting it into frames and then extracting a description... Doing it as shown in the link can make it complex... Any workarounds?
Add a reply
Sign up and join the conversation on Discord