Tool
image descriptions
Tool ID
image_descriptions
Creator
@camille12053jacky
Downloads
383+
Uses Qwen2-VL to generate image descriptions.
README
No README available
Tool Code
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor


class Tools:
    def __init__(self):
        self.model_directory = "E:/OllamaUse/"
        # Load the model in half precision (bfloat16) from the local cache
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            local_files_only=True,
            cache_dir=self.model_directory,
            torch_dtype=torch.bfloat16,
        )
        # Cap the image resolution the processor will feed to the vision encoder
        self.processor = AutoProcessor.from_pretrained(
            "Qwen/Qwen2-VL-2B-Instruct",
            local_files_only=True,
            cache_dir=self.model_directory,
            max_pixels=300 * 28 * 28,
        )

    def describe_image(self, image_path: str):
        image = Image.open(image_path)
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {
                        "type": "text",
                        "text": "What is inside the red box in this image? I only want the content inside the red box.",
                    },
                ],
            }
        ]
        text_prompt = self.processor.apply_chat_template(
            conversation, add_generation_prompt=True
        )
        inputs = self.processor(
            text=[text_prompt], images=[image], padding=True, return_tensors="pt"
        )
        inputs = inputs.to("cpu")

        # Inference: generate, then strip the prompt tokens from each sequence
        output_ids = self.model.generate(**inputs, max_new_tokens=128)
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(inputs.input_ids, output_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return output_text


tool = Tools()
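A minimal usage sketch, assuming the Qwen2-VL-2B-Instruct weights are already cached under self.model_directory (from_pretrained() is called with local_files_only=True, so nothing is downloaded) and that "screenshot.png" is a hypothetical example image path:

# Minimal usage sketch: the module already instantiates `tool` at import time,
# so describing an image is a single method call.
# "screenshot.png" is a hypothetical path, not part of the tool itself.
if __name__ == "__main__":
    descriptions = tool.describe_image("screenshot.png")
    # batch_decode returns a list of strings, one per image in the batch
    print(descriptions[0])

Note that max_pixels=300 * 28 * 28 keeps the vision encoder's input small, which trades description detail for lower memory use and faster CPU inference.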