import matplotlib.pyplot as plt
import cv2
import numpy as np
from IPython.display import display, Image
import threading

running = False
cap = None
display_handle = None

def view():
    """Stream mirrored webcam frames into the notebook output until stopped.

    Intended to run on the worker thread created by ``start_stream``. Reads
    frames from the module-level ``cap``, flips them horizontally, encodes
    them as JPEG and pushes them to an IPython display handle. The loop ends
    when ``running`` becomes false or the camera stops delivering frames.

    On exit (normal or via an exception) the camera is released, the display
    is cleared and ``running`` is reset so the stream can be started again.
    """
    global running, cap, display_handle
    display_handle = display(None, display_id=True)
    try:
        while running:
            ret, frame = cap.read()
            if not ret:
                # The camera failed to deliver a frame; stop streaming.
                break
            frame = cv2.flip(frame, 1)  # flipCode=1: horizontal mirror
            encoded_ok, frame_encoded = cv2.imencode('.jpeg', frame)
            if not encoded_ok:
                # Skip a frame that could not be encoded rather than
                # sending invalid image data to the front end.
                continue
            display_handle.update(Image(data=frame_encoded.tobytes()))
    finally:
        # Without this reset a read failure would leave ``running`` stuck at
        # True and ``start_stream`` would refuse to start a new stream.
        running = False
        cap.release()
        display_handle.update(None)

def start_stream():
    """Open the default webcam and start the background viewer thread.

    Does nothing if a stream is already running. If camera 0 cannot be
    opened, a message is printed and no thread is started, so ``running``
    stays false and a later call can retry.
    """
    global running, cap
    if running:
        return

    camera = cv2.VideoCapture(0)
    if not camera.isOpened():
        camera.release()
        print("Could not open the webcam (device 0).")
        return

    # Publish the capture and the flag only once the device is usable, so
    # the worker never starts against a dead capture.
    cap = camera
    running = True
    thread = threading.Thread(target=view)
    thread.start()

def stop_stream():
    """Request the video stream to stop by clearing the ``running`` flag."""
    global running
    # Only the flag is cleared here; no camera or display call is made.
    running = False

def capture_image():
    """Grab one frame from the default webcam, save it and show it inline.

    The frame is written to ``captured_image.jpg`` in the current working
    directory and then displayed with matplotlib. If the frame cannot be
    captured or saved, a message is printed and nothing is displayed.
    """
    filename = 'captured_image.jpg'

    camera = cv2.VideoCapture(0)
    try:
        ret, frame = camera.read()
    finally:
        # Always hand the device back, even if read() raises.
        camera.release()

    if not ret:
        print("Could not capture an image from the webcam.")
        return

    if not cv2.imwrite(filename, frame):
        print(f"Could not save the image as {filename}")
        return
    print(f"Image saved as {filename}")

    # Display the captured frame directly; re-reading the file is redundant
    # and cv2.imread returns None on failure, which would crash cvtColor.
    plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # OpenCV is BGR
    plt.axis('off')
    plt.show()

start_stream()  # To start the video stream

# Output: None

capture_image()  # To capture an image from the webcam

# Output: Image saved as captured_image.jpg

#stop_stream()   # To stop the video stream

# Adapted from OpenAI's Vision example 
from openai import OpenAI
import base64
import requests

# Point to the local server
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# Ask the user for a path on the filesystem:
path = input("Enter a local filepath to an image: ")

# Strip only whitespace and quotes that wrap the path (as added when a path is
# pasted or dragged into a terminal); quotes inside the path are kept.
image_path = path.strip().strip("'\"")

# Read the image and encode it to base64:
base64_image = ""
try:
    with open(image_path, "rb") as image_file:
        image = image_file.read()
    base64_image = base64.b64encode(image).decode("utf-8")
except (OSError, ValueError):
    # OSError: missing or unreadable file. ValueError: malformed path.
    print("Couldn't read the image. Make sure the path is correct and the file exists.")
    # The site-provided exit() helper is meant for interactive use only.
    raise SystemExit(1)

# Send the text prompt and the base64-encoded image to the model and request
# a streamed response.
completion = client.chat.completions.create(
    model="cjpais/llava-1.6-mistral-7b-gguf",
    messages=[
        {
            "role": "system",
            "content": "This is a chat between a user and an assistant. The assistant is helping the user to describe an image.",
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What does this hand gesture image show?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    },
                },
            ],
        },
    ],
    max_tokens=1000,
    stream=True,
)

# Print the reply incrementally as chunks arrive.
for chunk in completion:
    if not chunk.choices:
        # A chunk with an empty choices list would raise IndexError below.
        continue
    delta_text = chunk.choices[0].delta.content
    if delta_text:
        print(delta_text, end="", flush=True)

# GestureME

# Functions to open the camera stream, capture and save an image, and close the stream