# High-Performance Image Description via llama.cpp
# ==============================================================================
# --- Python Library Requirements ---
# This tool requires the 'pydantic' library, which is typically already
# installed with OpenWebUI. No other special Python libraries are needed
# because all heavy lifting is done by the external llama.cpp binary.
import os
import subprocess
from pydantic import Field
# ==============================================================================
# IMPORTANT: USER SETUP REQUIRED
# ==============================================================================
#
# 1. Install llama.cpp:
#    You must have llama.cpp compiled on your system; the standard build
#    produces the llama-mtmd-cli binary that this tool invokes.
#    Follow the instructions at https://github.com/ggerganov/llama.cpp
#
# 2. Download a Vision Model (GGUF):
#    This tool requires a pre-downloaded multimodal vision model in GGUF
#    format, along with its corresponding multimodal projector (mmproj) file.
#    An illustrative download command follows this setup block.
#    - Example Model: Qwen2.5-VL-3B-Instruct-Q8_0.gguf
#    - Example Projector: Qwen2.5-VL-3B-Instruct.mmproj-fp16.gguf
#
# 3. UPDATE THE PATHS BELOW:
# You MUST update the following three constant variables to match the
# paths on your own server.
#
# ==============================================================================
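# --- Illustrative model download (a sketch only; the repo id and file names
# below are placeholders -- substitute the real values from Hugging Face) ---
#
#   huggingface-cli download <repo-id> <model>.gguf <mmproj>.gguf --local-dir ./models
#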
# --- Configuration Constants ---
# --- STEP 1: Update this to the location of your compiled llama-mtmd-cli binary ---
LLAMA_MTMD_CLI = "/home/your_user/AI/llama_cpp/llama.cpp/build/bin/llama-mtmd-cli"
# --- STEP 2: Update this to the path of your downloaded GGUF vision model ---
MODEL_PATH = "/path/to/your/gguf_model_file/Qwen2.5-VL-3B-Instruct.Q8_0.gguf"
# --- STEP 3: Update this to the path of your downloaded mmproj file for the model ---
MMPROJ_PATH = "/path/to/your/mmproj_gguf_file/Qwen2.5-VL-3B-Instruct.mmproj-fp16.gguf"
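# --- Optional startup sanity check (illustrative sketch, not required) ---
# Assumption: failing fast on bad paths is preferable to a cryptic subprocess
# error at first use. This only warns; it does not stop the tool from loading.
for _name, _path in (
    ("LLAMA_MTMD_CLI", LLAMA_MTMD_CLI),
    ("MODEL_PATH", MODEL_PATH),
    ("MMPROJ_PATH", MMPROJ_PATH),
):
    if not os.path.exists(_path):
        print(f"WARNING: {_name} points to a missing path: {_path}")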
# --- OpenWebUI Tool Class Definition ---
# This class provides the structure that OpenWebUI needs to understand the tool.
class Tools:
def describe_image_with_llamacpp(
self,
image_path: str = Field(
..., description="The full, local file path to the image to be described."
),
) -> str:
"""
Analyzes the visual contents of an image from a file path using the fast llama.cpp backend.
"""
try:
print(f"Describing image with llama.cpp: {image_path}")
# --- The "Ultimate Prompt" ---
# This prompt gives the model a clear persona ("expert image analyst"),
# sets strict rules ("never assume," "do not ask follow-up questions"),
# and provides a powerful context ("You are helping a blind user")
# to ensure the description is detailed, objective, and helpful.
            full_prompt = "You are an expert image analyst. You never assume anything. If any text is present, you transcribe it accurately. If no text is present in the image, you describe its contents exactly as they are. You do not ask follow-up questions. You are helping a blind user. Please describe the contents of this image in detail to the blind user."
# --- Building the Command ---
# This list contains all the arguments passed to the llama.cpp binary.
# These flags have been tested and optimized for quality and stability on a CPU.
cli_cmd = [
LLAMA_MTMD_CLI,
"-m", MODEL_PATH,
"--mmproj", MMPROJ_PATH,
"--image", image_path,
"--prompt", full_prompt,
"--temp", "0", # For deterministic, focused output.
"--threads", "6", # Reduced from max to keep the UI responsive.
"--cpu-strict", "1", # Ensures execution on specified thread count.
"--prio-batch", "3", # A performance tuning flag.
"--ctx-size", "8192", # Context size for the model.
"--mlock", # Locks the model in memory for faster access.
"--repeat-penalty", "1.3", # Prevents the model from getting stuck in loops.
]
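            # Optional debugging aid (illustrative): shlex.join() renders the
            # argument list as a single copy-pasteable shell command
            # (available in Python 3.8+). Uncomment to log the exact command:
            # import shlex; print("Running:", shlex.join(cli_cmd))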
            # --- Executing the Tool ---
            # 'subprocess.run' executes the llama.cpp binary in a separate OS
            # process, keeping the heavy inference work out of the OpenWebUI
            # Python interpreter. The timeout ensures the call cannot hang
            # forever if the process gets stuck.
result = subprocess.run(
cli_cmd,
capture_output=True,
text=True,
                timeout=900,  # Generous 15-minute timeout.
)
# Check for errors from the subprocess itself. The error messages from
# llama.cpp are printed to 'stderr'.
if result.returncode != 0:
error_message = f"llama-mtmd-cli failed: {result.stderr.strip()}"
print(error_message)
return error_message
# The model's description is the standard output ('stdout').
output_text = result.stdout.strip()
print(f"Generated description: {output_text}")
return output_text
        except FileNotFoundError:
            # subprocess.run raises FileNotFoundError when the executable itself
            # is missing, not when the image file is absent.
            return f"Error: llama-mtmd-cli binary not found at: {LLAMA_MTMD_CLI}"
except subprocess.TimeoutExpired:
return "Error: The image description process took longer than the 15-minute timeout."
except Exception as e:
error_message = f"An unexpected error occurred: {e}"
print(error_message)
return error_message
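# --- Optional standalone test harness (illustrative sketch) ---
# Assumption: OpenWebUI only imports the Tools class; this block runs solely
# when the file is executed directly, e.g. `python <this_file>.py image.jpg`.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print(f"Usage: python {sys.argv[0]} /path/to/image.jpg")
        sys.exit(1)
    print(Tools().describe_image_with_llamacpp(sys.argv[1]))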