这里可以让你在本地使用CPU来运行 "PaddleOCR-vl-0.9B",来看看吧,我好久没水帖了

早上还在坐地铁呢,就有佬友分享新模型的帖子了

我一向是热衷于折腾边缘计算小模型的,所以搞了一早上+下午的45分钟,写出一个勉强能用的脚本,让PaddleOCR-VL-0.9B能够在CPU上运行,并使用openai api格式输出结果

公司里部分业务最近也在弄OCR,所以程序也优先调整成Dify可输出的格式(我写完以后突然开始犯懒,不想测试curl调用,老精神病了)。就先用Dify上测试的结果来演示吧。

先去下载模型

脚本在这里

(输入的图片过大时会存在进程挂起的问题,所以大家可以继续优化,新的结果回帖给我,我会更新主帖的。老懒狗了,写完就不想优化,想开始摸鱼。靡不有初,鲜克有终这块嗷,大家不要学我)

2025/10/17 16:06 已缓解了内存溢出问题,并为CPU提供不同精度的加载途径

import base64
from io import BytesIO
import os
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import uvicorn
import requests
import time
import json
import gc
import traceback

# Path to the locally downloaded PaddleOCR-VL-0.9B weights directory.
LOCAL_PATH = "/models/PaddleOCR-VL-0.9B" # replace with your local path

print(f"=== PaddleOCR-VL API 启动中 ===")
print(f"模型路径: {LOCAL_PATH}")

# Load the processor and model once at startup so every request reuses them.
try:
    processor = AutoProcessor.from_pretrained(LOCAL_PATH, trust_remote_code=True, use_fast=True)
    
    # Try loading in float16 first (half the memory); fall back to float32 on failure.
    try:
        print("尝试加载 float16 模型...")
        model = AutoModelForCausalLM.from_pretrained(
            LOCAL_PATH, 
            trust_remote_code=True, 
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        ).to("cpu")
        print("使用 float16 精度")
    except Exception as e:
        print(f"float16 加载失败,回退到 float32: {e}")
        model = AutoModelForCausalLM.from_pretrained(
            LOCAL_PATH, 
            trust_remote_code=True, 
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True
        ).to("cpu")
        print("使用 float32 精度")
    
    model.eval()  # inference mode: disables dropout etc.
    # Align the generation config's pad token with the model config to avoid warnings.
    model.generation_config.pad_token_id = model.config.pad_token_id
    print("模型加载成功")
except Exception as e:
    # Startup is useless without the model — log and re-raise to abort.
    print(f"模型加载失败: {e}")
    traceback.print_exc()
    raise

# FastAPI app exposing an OpenAI-compatible chat-completions API.
app = FastAPI()

@app.get("/v1/models")
async def list_models():
    """Return the single locally served model in OpenAI /v1/models format."""
    model_entry = {
        "id": "paddleocr-vl",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "custom",
        "permission": [],
        "root": "paddleocr-vl",
        "parent": None,
        "capabilities": {
            "vision": True,
            "function_calling": False,
            "fine_tuning": False,
        },
    }
    return {"object": "list", "data": [model_entry]}

def _extract_prompt_and_images(body: dict):
    """Pull the text prompt and image URLs out of the last chat message.

    Returns:
        tuple[str, list[str]]: (prompt text, image URLs in message order).

    Raises:
        HTTPException: 400 when messages are missing or malformed.
    """
    try:
        messages = body.get("messages", [])
        if not messages:
            raise HTTPException(status_code=400, detail="请求体中缺少messages字段")

        content = messages[-1].get("content", [])
        if isinstance(content, str):
            # Plain-string content: the string is the prompt, no images.
            return content, []
        text_parts = [c["text"] for c in content if c["type"] == "text"]
        text_prompt = " ".join(text_parts) or "Parse the document."
        image_urls = [c["image_url"]["url"] for c in content if c["type"] == "image_url"]
        return text_prompt, image_urls
    except KeyError as e:
        print(f"请求格式错误: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=400, detail=f"请求格式错误: {e}")


def _load_image(url: str):
    """Decode one image from a data URL or fetch it over HTTP, forced to RGB.

    Raises:
        HTTPException: 400 on download or decode failure.
    """
    try:
        if url.startswith("data:"):
            if "," in url:
                _, b64_data = url.split(",", 1)
            elif url.startswith("data:image/"):
                # Malformed data URL without a comma; best-effort salvage.
                b64_data = url.replace("data:image/", "").split(";")[0]
            else:
                b64_data = url.replace("data:", "")
            img = Image.open(BytesIO(base64.b64decode(b64_data)))
        else:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
    except requests.exceptions.RequestException as e:
        print(f"图片下载失败: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=400, detail=f"图片下载失败: {e}")
    except Exception as e:
        print(f"图片处理失败: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=400, detail=f"图片处理失败: {e}")

    # The model expects 3-channel RGB input.
    if img.mode != 'RGB':
        img = img.convert('RGB')
    return img


def _render_prompt(text_prompt: str, has_image: bool) -> str:
    """Build the model prompt via the tokenizer chat template.

    Falls back to a hand-built prompt (with image placeholder tokens when an
    image is present) if the template fails.
    """
    chat_messages = [{"role": "user", "content": text_prompt}]
    try:
        return processor.tokenizer.apply_chat_template(
            chat_messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Chat template 失败,使用备用格式: {e}")
        cls_token = "<|begin_of_sentence|>"
        if has_image:
            return f"{cls_token}User: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>{text_prompt}\nAssistant: "
        return f"{cls_token}User: {text_prompt}\nAssistant: "


async def _stream_chunks(generated: str, completion_id: str, created_time: int):
    """Yield the generated text as OpenAI chat.completion.chunk SSE events."""
    words = generated.split()
    chunk_size = 3  # words per SSE chunk

    for i in range(0, len(words), chunk_size):
        chunk_text = " ".join(words[i:i + chunk_size])
        if i + chunk_size < len(words):
            chunk_text += " "

        # Only the first chunk carries the assistant role, per OpenAI spec.
        if i == 0:
            delta = {"role": "assistant", "content": chunk_text}
        else:
            delta = {"content": chunk_text}

        chunk = {
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created_time,
            "model": "paddleocr-vl-0.9b",
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": None
            }]
        }
        yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"

    final_chunk = {
        "id": completion_id,
        "object": "chat.completion.chunk",
        "created": created_time,
        "model": "paddleocr-vl-0.9b",
        "choices": [{
            "index": 0,
            "delta": {},
            "finish_reason": "stop"
        }]
    }
    yield f"data: {json.dumps(final_chunk, ensure_ascii=False)}\n\n"
    yield "data: [DONE]\n\n"


@app.post("/v1/chat/completions")
async def chat_completions(body: dict):
    """OpenAI-compatible chat completion endpoint backed by PaddleOCR-VL.

    Uses only the last message of the conversation: its text parts become the
    prompt and the first referenced image (data URL or http(s) URL) is fed to
    the model. Supports both streaming (SSE) and plain JSON responses.

    Raises:
        HTTPException: 400 for malformed requests or unloadable images,
            500 for input-processing or inference failures.
    """
    text_prompt, image_urls = _extract_prompt_and_images(body)

    # Decode every referenced image, but only the first is passed to the
    # model — this processor handles a single image per request.
    images = [_load_image(url) for url in image_urls]
    image = images[0] if images else None

    prompt = _render_prompt(text_prompt, image is not None)

    try:
        inputs = processor(text=prompt, images=image, return_tensors="pt").to("cpu")
    except Exception as e:
        print(f"输入处理失败: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"输入处理失败: {e}")
    finally:
        # PIL images are no longer needed once the processor has tensorized
        # them; close eagerly (success or failure) to cap memory on large
        # inputs. Closing twice is harmless, so swallow close errors only.
        for img in images:
            try:
                img.close()
            except Exception:
                pass

    try:
        max_tokens = body.get("max_tokens", 131072)
        temperature = body.get("temperature", 0.7)

        print(f"开始推理 (max_tokens={max_tokens}, temp={temperature})...")
        with torch.no_grad():  # no gradients needed for inference
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=temperature > 0,
                use_cache=True,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                top_p=0.9,
                top_k=50
            )
        print("推理完成")
    except Exception as e:
        print(f"模型推理失败: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"模型推理失败: {e}")

    full_output = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Token counts for the OpenAI "usage" block, captured before tensors die.
    prompt_tokens = len(inputs["input_ids"][0])
    total_tokens = len(outputs[0])
    completion_tokens = total_tokens - prompt_tokens

    # Free inference tensors promptly to keep the resident set small on CPU.
    del inputs
    del outputs
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Strip the echoed prompt: prefer splitting on the assistant marker,
    # otherwise fall back to removing the literal prompt text.
    if "Assistant: " in full_output:
        parts = full_output.split("Assistant: ")
        generated = parts[-1].strip() if len(parts) > 1 else full_output
    else:
        generated = full_output.replace(prompt, "").strip()

    stream = body.get("stream", False)
    completion_id = f"chatcmpl-{int(time.time())}"
    created_time = int(time.time())

    if stream:
        return StreamingResponse(
            _stream_chunks(generated, completion_id, created_time),
            media_type="text/event-stream",
        )

    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": created_time,
        "model": "paddleocr-vl-0.9b",
        "choices": [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": generated
            },
            "finish_reason": "stop"
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens
        }
    }

if __name__ == "__main__":
    # Serve the API on all interfaces; change the port if 7777 is taken.
    uvicorn.run(app, host="0.0.0.0", port=7777)

测试结果

实测单线程 INTEL(R) XEON(R) PLATINUM 8558P
处理1720*860的图片,输出2200字符无文字丢失,耗费了50秒

70 个赞

感谢大佬的教程!

3 个赞

好快的回帖!大风车!

1 个赞

顶一下佬,爱看

2 个赞

点赞收藏~

2 个赞

感谢佬友分享 :folded_hands:

2 个赞

感谢分享!

2 个赞

太强了,大佬

2 个赞

顶不起来了,凉了,大家不爱学习如何本地部署了 :tieba_087:

1 个赞

大帅哥来了,男宾一位

1 个赞

佬有没有TTS/ASR 的本地部署经验

1 个赞

之前弄过cosyvoice,后面图方便还是用微软的tts,asr的话基本用API代替

1 个赞

这个挂起多少内存?跑起来几代U没压力?

2 个赞

三连三连!

2 个赞

了解了 :tieba_087: cosyvoice 确实不错
edge tts 现在也在用,但是网络不友好。现在在研究 indextts,有位佬弄了个 indextts+vllm 加速的,1.5 的在 4090 RTF 能干到 0.1,0.3-0.1 之间

2 个赞

内存占用主要取决于输入图片的大小,模型本身载入用不了多少
image

程序没有优化好只是通过了可运行测试,现在比较容易溢出

负载的话单核压力会稍微大点,我这只用8558P试了下,还没在其他机器上试过呢

1 个赞

去试试这个,docker部署的,速度上还可以

4090跑tts也太奢侈了,我一般能用CPU的都用CPU :tieba_087:

1 个赞

举报屏蔽加指定始皇回复处理吗 :tieba_087:

1 个赞

挺好,ocr真是好东西。弄好了作用很大!

1 个赞

没想到在有生之年还能见到 Paddle

1 个赞