For image batching, run parallel jobs instead of sending many images per batch; some models are just not good enough to handle multiple images #1737

pseudotensor · 2024-07-13T04:52:12Z

import cv2
from openai import OpenAI

# Client that talks to a custom base URL; `<ip>` looks like a placeholder to
# be replaced with the real server address.
client = OpenAI(base_url='http://<ip>:80/v1')
# Model name passed to the chat completion request.
model="OpenGVLab/InternVL2-26B"
# Alternative client/model configuration, currently disabled.
#client = OpenAI(base_url='http://<ip>:80/v1')
#model = 'OpenGVLab/InternVL-Chat-V1-5'

# Text part of the user message: response instructions followed by the
# actual question about the images.
prompt = """<response_instructions>
- Act as a keen observer with a sharp eye for detail.
- Analyze the content within the images.
- Provide insights based on your observations.
- Avoid making up facts.
- Finally, according to our chat history, above documents, above figure captions, or given images, generate a well-structured response.
</response_instructions>
What tower do you see in the image?
"""

from PIL import Image
import base64
import requests
from io import BytesIO


# The encoding function I linked previously - but we actually don't use this function in the API server
def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
    """Encode an image as a base64 string.

    The image is serialized into an in-memory buffer via ``image.save`` and
    the resulting bytes are base64-encoded.

    Args:
        image: Image to encode.
        format: Format name handed to ``image.save``. It is compared
            case-insensitively against ``'JPEG'`` to decide whether the
            image must be converted to RGB first.

    Returns:
        The base64 encoding of the serialized image, as a UTF-8 string.
    """
    buffered = BytesIO()
    # JPEG cannot store an alpha channel, so normalize to RGB before saving.
    # The comparison ignores case so that 'jpeg' gets the same conversion as
    # 'JPEG' instead of failing later inside save() for RGBA/palette images.
    if format.upper() == 'JPEG':
        image = image.convert('RGB')
    image.save(buffered, format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


# This is what we use in the API server to load the base64 string to image
def load_image_from_base64(image: str):
    """Decode a base64 string and open the decoded bytes as an image."""
    raw_bytes = base64.b64decode(image)
    stream = BytesIO(raw_bytes)
    return Image.open(stream)


# Local image files; each one is read, base64-encoded and attached to the
# chat request as a data URL.
image1 = '/tmp/image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952.jpeg'
image2 = '/tmp/image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378.jpeg'
image3 = '/tmp/image_file_ac5589e7-92a3-470f-a933-40d6bad38052.jpeg'

#from PIL import Image


def remove_padding(image_path, output_path, background_color=(255, 255, 255)):
    """Crop bright padding away from an image and save the result.

    Pixels whose grayscale value is above 240 are treated as background.
    The image is cropped to the bounding box with the largest area among
    the external contours of the remaining (non-background) pixels. If the
    whole image counts as background, it is written out unchanged.

    Args:
        image_path: Path of the image to read.
        output_path: Path the (cropped) image is written to.
        background_color: Currently unused; detection relies on the fixed
            grayscale threshold. Kept so existing callers keep working.

    Raises:
        ValueError: If the image at ``image_path`` cannot be read.
        OSError: If the result cannot be written to ``output_path``.
    """
    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Build a mask in which non-background pixels are white.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY)
    inverted_binary = cv2.bitwise_not(binary)

    contours, _ = cv2.findContours(inverted_binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        # Pick the bounding box with the largest area (first one wins ties).
        x, y, w, h = max(
            (cv2.boundingRect(contour) for contour in contours),
            key=lambda rect: rect[2] * rect[3],
        )
        cropped_image = image[y:y+h, x:x+w]
    else:
        # Nothing but background was found: there is nothing to crop to.
        cropped_image = image

    # cv2.imwrite reports failure through its return value.
    if not cv2.imwrite(output_path, cropped_image):
        raise OSError(f"Could not write image: {output_path}")


# Example usage: optionally strip the padding before sending the images.
# The branch is disabled, so `ext` stays empty and the original files are used.
if False:
    ext = 'b.jpg'
    remove_padding(image1, image1 + ext)
    remove_padding(image2, image2 + ext)
    remove_padding(image3, image3 + ext)
else:
    ext = ''


def _read_file_base64(path):
    """Return the content of the file at ``path`` as a base64 string."""
    # The context manager closes the file handle as soon as it has been read.
    with open(path, 'rb') as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


image1_64 = _read_file_base64(image1 + ext)
image2_64 = _read_file_base64(image2 + ext)
image3_64 = _read_file_base64(image3 + ext)

# Only referenced by the disabled system message below.
system_prompt = "You are h2oGPTe, an expert question-answering AI system created by H2O.ai that performs like GPT-4 by OpenAI."

# A single user turn: all three images as base64 data URLs, then the prompt.
messages = [
    #{'role': 'system', 'content': system_prompt},
    {
        'role': 'user',
        'content': [
            *(
                {
                    'type': 'image_url',
                    'image_url': {'url': 'data:image/jpeg;base64,' + encoded},
                }
                for encoded in (image1_64, image2_64, image3_64)
            ),
            {'type': 'text', 'text': prompt},
        ],
    }
]

# Send all images in one request.
response = client.chat.completions.create(
    temperature=0.0,
    max_tokens=300,
    messages=messages,
    model=model,
)

# Show the first returned choice.
print(response.choices[0])

Running this script gives the following response:

The image does not show a tower. Instead, it shows two separate items:\n\n1. A receipt from a shopping store.\n2. A cake with a message congratulating Kate and Duke on their upcoming arrival.\n\nIf you have any specific questions about these items, please let me know!

The text was updated successfully, but these errors were encountered:

pseudotensor · 2024-07-13T04:52:39Z

pseudotensor self-assigned this Jul 13, 2024

pseudotensor added the type/feature Feature request label Jul 13, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

for image batching, do parallel jobs instead of many images per batch, some models just not good enough to handle multiple images #1737

for image batching, do parallel jobs instead of many images per batch, some models just not good enough to handle multiple images #1737

for image batching, do parallel jobs instead of many images per batch, some models just not good enough to handle multiple images #1737

for image batching, do parallel jobs instead of many images per batch, some models just not good enough to handle multiple images #1737

Comments