在Openwebui中对传入的图片进行文字识别后再交给模型处理

github地址: https://github.com/xinhai-ai/imageocr
在传入图片时首先对图像进行文字识别,再将识别结果传给模型处理
使不支持图像的o1-mini也能读图做数学题
效果图:

使用方法

复制imageocr.py的内容
或者复制以下代码:

import asyncio
import re
from typing import Callable, Awaitable, Any, Optional

import aiohttp
from pydantic import BaseModel, Field


class Filter:
    """OpenWebUI inlet filter: OCR an incoming image through an LLM vision API,
    then replace the image content part with the recognized text so that
    text-only models (e.g. o1-mini) can work with image content."""

    class Valves(BaseModel):
        # User-tunable settings surfaced in the OpenWebUI admin panel.
        priority: int = Field(default=0, description="用于过滤操作的优先级别。")
        OCR_Base_URL: str = Field(
            default="https://api.openai.com", description="LLm OCR API的基础URL。"
        )
        OCR_API_KEY: str = Field(default="", description="API的API密钥。")
        max_retries: int = Field(default=3, description="HTTP请求的最大重试次数。")
        ocr_prompt: str = Field(
            default="Please only recognize and extract the text or data from this image without interpreting, analyzing, or understanding the content. Do not output any additional information. Simply return the recognized text or data content.",
            description="进行OCR识别的提示词",
        )
        model_name: str = Field(
            default="gemini-1.5-flash-latest",
            description="用于OCR图像的模型名称。推荐使用gemini系列",
        )

    def __init__(self):
        self.valves = self.Valves()

    async def _perform_ocr(
        self, image: str, event_emitter: Callable[[Any], Awaitable[None]]
    ) -> str:
        """Send *image* (URL or data URI) to the OCR model and return the text.

        Emits progress status events through *event_emitter*. Retries up to
        ``max_retries`` times and raises RuntimeError if every attempt fails
        (or if ``max_retries`` < 1), instead of silently returning None.
        """
        await event_emitter(
            {
                "type": "status",
                "data": {
                    "description": "✨正在对图像进行文字识别中,请耐心等待...",
                    "done": False,
                },
            }
        )

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.valves.OCR_API_KEY}",
        }
        # OpenAI-compatible chat-completions payload with one image part.
        ocr_body = {
            "model": self.valves.model_name,
            "messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": self.valves.ocr_prompt}],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.valves.ocr_prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": image, "detail": "high"},
                        },
                    ],
                },
            ],
        }
        url = f"{self.valves.OCR_Base_URL}/v1/chat/completions"

        last_error: Optional[Exception] = None
        async with aiohttp.ClientSession() as session:
            for attempt in range(self.valves.max_retries):
                try:
                    async with session.post(
                        url, json=ocr_body, headers=headers
                    ) as response:
                        response.raise_for_status()
                        response_data = await response.json()
                        result = response_data["choices"][0]["message"]["content"]

                        await event_emitter(
                            {
                                "type": "status",
                                "data": {
                                    "description": "🎉识别成功,交由模型进行处理...",
                                    "done": True,
                                },
                            }
                        )

                        return result
                except Exception as e:
                    last_error = e
        # All attempts failed (or max_retries < 1): fail loudly so the caller
        # can report the error instead of receiving an implicit None.
        raise RuntimeError(f"OCR识别失败:{last_error}")

    async def inlet(
        self,
        body: dict,
        __event_emitter__: Callable[[Any], Awaitable[None]],
        __user__: Optional[dict] = None,
        __model__: Optional[dict] = None,
    ) -> dict:
        """OCR the image on the first dialog turn; strip images on later turns.

        Returns *body* with the image content part either replaced by the
        recognized text (first turn) or removed (subsequent turns).
        """
        messages = body.get("messages", [])

        # Locate the first image in the conversation.
        image_info = self._find_image_in_messages(messages)
        if not image_info:
            return body

        message_index, content_index, image = image_info

        # Count only user/assistant messages so that a system prompt does not
        # make the first turn look like a later round (which would silently
        # drop the image instead of OCR-ing it).
        dialog_count = sum(
            1 for m in messages if m.get("role") in ("user", "assistant")
        )
        if dialog_count >= 2:
            # Second round or later: remove the image instead of re-OCR-ing.
            del messages[message_index]["content"][content_index]
            body["messages"] = messages
            return body

        try:
            # First turn: run OCR and swap the image part for a text part.
            result = await self._perform_ocr(image, __event_emitter__)

            messages[message_index]["content"][content_index]["type"] = "text"
            messages[message_index]["content"][content_index].pop("image_url", None)
            messages[message_index]["content"][content_index]["text"] = result
            body["messages"] = messages
        except Exception as e:
            # Best-effort: on OCR failure, pass the body through unchanged.
            print(f"OCR识别错误: {e}")

        return body

    def _find_image_in_messages(self, messages):
        """Return (message_index, content_index, image_url) of the first
        image_url content part in a user message, or None if there is none."""
        for m_index, message in enumerate(messages):
            if message.get("role") == "user" and isinstance(
                message.get("content"), list
            ):
                for c_index, content in enumerate(message["content"]):
                    # .get avoids KeyError on content parts without a "type".
                    if content.get("type") == "image_url":
                        return m_index, c_index, content["image_url"]["url"]
        return None

    async def outlet(
        self,
        body: dict,
        __event_emitter__: Callable[[Any], Awaitable[None]],
        __user__: Optional[dict] = None,
        __model__: Optional[dict] = None,
    ) -> dict:
        """No-op outlet: responses pass through unchanged."""
        return body

前往管理员设置界面

点击添加函数



在工作空间新建一个模型

注意

只能在首轮对话时传入一张图片

只有在首轮对话时才会进行文字识别,在之后的对话中,会将传入的图片剔除

特别感谢

在OpenWebUI中使用FLUX绘画(硅基流动) 部分内容来自此函数

16 个赞

使用报错
Cannot use ‘in’ operator to search for ‘detail’ in Internal server error

使用的哪个模型,填的哪个api,试试删去 "detail": "high" 看看,我用的gemini-1.5-flash-latest模型,还没试过其他的模型

这玩意可以做成插件用在微信机器人吗?

如果改成tool 会不会好一些?

用pipeline是不是更好一些

感谢大佬教程

4 个赞

平常用的多的模型也就 o1 没有识图功能,写成函数也够用了。当然也欢迎再修改!tieba_087

弄成函数简单的,简单点好…tieba_009

不行,不知道为啥

报了什么错吗。


这样就没后续了

0.4.8版本

函数配置里面用的哪个模型,0.4.8的版本的时候我这边是可以用的…

gemini-2.0-flash-exp
换其他也一样

感谢佬,解决了 :+1:

佬有一个问题,如果open-webui添加了系统级提示词,例如在用户中设置了提示词。则图片识别无法启动。我用O1-preview糊一下,如下能解决

import asyncio
import re
from typing import Callable, Awaitable, Any, Optional
import aiohttp
from pydantic import BaseModel, Field


class Filter:
    """OpenWebUI inlet filter: OCR an incoming image through an LLM vision API,
    then hand the recognized text to the chat model. This variant ignores
    system-prompt messages when deciding whether this is the first turn."""

    class Valves(BaseModel):
        # User-tunable settings surfaced in the OpenWebUI admin panel.
        priority: int = Field(default=0, description="用于过滤操作的优先级别。")
        OCR_Base_URL: str = Field(
            default="https://api.openai.com", description="LLm OCR API的基础URL。"
        )
        OCR_API_KEY: str = Field(default="", description="API的API密钥。")
        max_retries: int = Field(default=3, description="HTTP请求的最大重试次数。")
        ocr_prompt: str = Field(
            default="Please only recognize and extract the text or data from this image without interpreting, analyzing, or understanding the content. Do not output any additional information. Simply return the recognized text or data content.",
            description="进行OCR识别的提示词",
        )
        model_name: str = Field(
            default="gemini-1.5-flash-latest",
            description="用于OCR图像的模型名称。推荐使用gemini系列",
        )

    def __init__(self):
        self.valves = self.Valves()

    async def _perform_ocr(
        self, image: str, event_emitter: Callable[[Any], Awaitable[None]]
    ) -> str:
        """Send *image* (URL or data URI) to the OCR model and return the text.

        Emits progress status events through *event_emitter*. Retries up to
        ``max_retries`` times and raises RuntimeError if every attempt fails
        (or if ``max_retries`` < 1), instead of silently returning None.
        """
        await event_emitter(
            {
                "type": "status",
                "data": {
                    "description": "✨正在对图像进行文字识别中,请耐心等待...",
                    "done": False,
                },
            }
        )
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.valves.OCR_API_KEY}",
        }
        # OpenAI-compatible chat-completions payload with one image part.
        ocr_body = {
            "model": self.valves.model_name,
            "messages": [
                {
                    "role": "system",
                    "content": [{"type": "text", "text": self.valves.ocr_prompt}],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.valves.ocr_prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": image, "detail": "high"},
                        },
                    ],
                },
            ],
        }
        url = f"{self.valves.OCR_Base_URL}/v1/chat/completions"
        last_error: Optional[Exception] = None
        async with aiohttp.ClientSession() as session:
            for attempt in range(self.valves.max_retries):
                try:
                    async with session.post(
                        url, json=ocr_body, headers=headers
                    ) as response:
                        response.raise_for_status()
                        response_data = await response.json()
                        result = response_data["choices"][0]["message"]["content"]
                        await event_emitter(
                            {
                                "type": "status",
                                "data": {
                                    "description": "🎉识别成功,交由模型进行处理...",
                                    "done": True,
                                },
                            }
                        )
                        return result
                except Exception as e:
                    last_error = e
        # All attempts failed (or max_retries < 1): fail loudly so the caller
        # can report the error instead of receiving an implicit None.
        raise RuntimeError(f"OCR识别失败:{last_error}")

    async def inlet(
        self,
        body: dict,
        __event_emitter__: Callable[[Any], Awaitable[None]],
        user: Optional[dict] = None,
        model: Optional[dict] = None,
    ) -> dict:
        """OCR the image on the first dialog turn; strip images on later turns.

        System-prompt messages are excluded from the turn count, so a
        configured system prompt does not suppress first-turn OCR.
        """
        messages = body.get("messages", [])
        # Locate the first image in the conversation.
        image_info = self._find_image_in_messages(messages)
        if not image_info:
            return body
        message_index, content_index, image = image_info
        # Count only user/assistant messages (ignore any system prompt).
        user_assistant_messages = [
            msg for msg in messages if msg.get("role") in ("user", "assistant")
        ]
        # Second round or later: remove the image instead of re-OCR-ing.
        if len(user_assistant_messages) >= 2:
            del messages[message_index]["content"][content_index]
            body["messages"] = messages
            return body
        try:
            # First turn: run OCR and swap the image part for a text part.
            result = await self._perform_ocr(image, __event_emitter__)
            messages[message_index]["content"][content_index]["type"] = "text"
            messages[message_index]["content"][content_index].pop("image_url", None)
            messages[message_index]["content"][content_index]["text"] = result
            body["messages"] = messages
        except Exception as e:
            # Best-effort: on OCR failure, pass the body through unchanged.
            print(f"OCR识别错误: {e}")
        return body

    def _find_image_in_messages(self, messages):
        """Return (message_index, content_index, image_url) of the first
        image_url content part in a user message, or None if there is none."""
        for m_index, message in enumerate(messages):
            if message.get("role") == "user" and isinstance(
                message.get("content"), list
            ):
                for c_index, content in enumerate(message["content"]):
                    # .get avoids KeyError on content parts without a "type".
                    if content.get("type") == "image_url":
                        return m_index, c_index, content["image_url"]["url"]
        return None

    async def outlet(
        self,
        body: dict,
        __event_emitter__: Callable[[Any], Awaitable[None]],
        user: Optional[dict] = None,
        model: Optional[dict] = None,
    ) -> dict:
        """No-op outlet: responses pass through unchanged."""
        return body

此话题已在最后回复的 30 天后被自动关闭。不再允许新回复。