🌋【GCP刨土系列】Document AI 带坐标的 OCR 识别

Qiner · 2025 年2 月 27 日 05:36

(\ _ /)
( ･-･)
/っ就是第 2 个东西，第 3 个不支持图片识别就不折腾了。

官网：https://console.cloud.google.com/ai/document-ai/processor-library

价格不重要，有赠金的时候才折腾，没赠金就嫖别的渠道.jpg

总而言之鼠标点点创建就行，会给一个 API 接口。

然后直接上源码：

DocumentaiFormExtractor.py

# pip install google-cloud-documentai
# pip install grpcio==1.50.0  # Windows 7 只能兼容到这个版本。
# pip install tqdm  # 用于进度条显示

# 导入必要的库
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
import tkinter as tk
from tkinter import filedialog
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# Route HTTP(S) traffic through a local proxy by setting the proxy environment variables.
os.environ.update({'http_proxy': 'http://127.0.0.1:1081', 'https_proxy': 'http://127.0.0.1:1081'})

# Path to the service-account key file, exported via GOOGLE_APPLICATION_CREDENTIALS.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "(你的鉴权文件).json" 

# Configuration
project_id = "(这里改成你端点的)"  # Your GCP project ID
location = "us"  # Processor location; may be "us" or "eu"
processor_id = "(这里改成你端点的)"  # ID of your Form Parser processor
max_workers = 5  # Maximum number of parallel worker threads; adjust as needed

def process_single_document(file_path, client, processor_name):
    """Send one file to a Document AI processor and save the result as JSON.

    The file is read into memory, its MIME type is chosen from the file
    extension, and the processed document is written to ``<stem>.json`` in
    the same directory as the input file.

    Args:
        file_path: Path of the PDF or image file to process.
        client: Object exposing ``process_document(request=...)``.
        processor_name: Full resource name of the processor to call.

    Returns:
        ``(True, file_path, output_json_path)`` on success, or
        ``(False, file_path, error_message)`` if any step raised.
    """
    # Extension -> MIME type for every format this script accepts.
    mime_by_extension = {
        ".pdf": "application/pdf",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
        ".bmp": "image/bmp",
    }

    try:
        # Load the whole file into memory.
        with open(file_path, "rb") as source:
            payload = source.read()

        # Resolve the MIME type; unknown extensions are reported as an error.
        extension = os.path.splitext(file_path)[1].lower()
        mime_type = mime_by_extension.get(extension)
        if mime_type is None:
            raise ValueError(f"不支持的文件格式: {extension}")

        # Build the request around the raw bytes and call the service.
        request = documentai.ProcessRequest(
            name=processor_name,
            raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
        )
        document = client.process_document(request=request).document

        # The JSON output sits next to the input and shares its stem.
        folder, base_name = os.path.split(file_path)
        stem = os.path.splitext(base_name)[0]
        output_file_path = os.path.join(folder, f"{stem}.json")

        with open(output_file_path, "w", encoding="utf-8") as sink:
            sink.write(documentai.Document.to_json(document))
    except Exception as exc:
        # Report the failure to the caller instead of aborting the batch.
        return False, file_path, str(exc)

    return True, file_path, output_file_path

def batch_process_documents():
    """Pick files in a dialog and process them with Document AI in parallel.

    Creates the Document AI client from the module-level ``project_id``,
    ``location`` and ``processor_id`` settings, asks the user to choose one
    or more files, runs ``process_single_document`` for each of them on a
    thread pool of ``max_workers`` threads, and prints a summary of the
    successes and failures.

    Returns:
        None. The function returns early if the user selects no file.
    """
    # Initialise the Document AI client against the configured endpoint.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # A hidden root window is required to host the file dialog.
    root = tk.Tk()
    root.withdraw()
    try:
        # Patterns are space-separated (Tk list form). The previous
        # semicolon-separated strings are not the documented Tk format and
        # may only be understood by some platforms' native dialogs.
        file_paths = filedialog.askopenfilenames(
            title="选择多个文件",
            filetypes=[
                ("所有支持的格式", "*.pdf *.jpg *.jpeg *.png *.gif *.tif *.tiff *.bmp"),
                ("PDF 文件", "*.pdf"),
                ("JPEG 图片", "*.jpg *.jpeg"),
                ("PNG 图片", "*.png"),
                ("GIF 图片", "*.gif"),
                ("TIFF 图片", "*.tif *.tiff"),
                ("BMP 图片", "*.bmp"),
            ],
        )
    finally:
        # Release the hidden window once the dialog has closed.
        root.destroy()

    # Nothing selected (dialog cancelled): stop here.
    if not file_paths:
        print("未选择文件，程序退出。")
        return

    total_files = len(file_paths)
    print(f"已选择 {total_files} 个文件，开始处理...")

    # Each entry is (input_path, output_path_or_error_message, succeeded).
    results = []

    # The context managers close the progress bar and shut the pool down
    # even if an unexpected error escapes the loop.
    with tqdm(total=total_files, desc="处理进度", unit="文件") as progress_bar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {
                executor.submit(process_single_document, file_path, client, processor_name): file_path
                for file_path in file_paths
            }

            # Collect results in completion order.
            for future in as_completed(future_to_file):
                file_path = future_to_file[future]
                try:
                    success, path, detail = future.result()
                    results.append((path, detail, bool(success)))
                except Exception as e:
                    results.append((file_path, str(e), False))

                progress_bar.update(1)

    # Summary statistics.
    success_count = sum(1 for _, _, success in results if success)
    failed_count = total_files - success_count

    print(f"\n处理完成！成功: {success_count} 个文件，失败: {failed_count} 个文件")

    # List every failure with its error message.
    if failed_count > 0:
        print("\n失败文件列表:")
        for path, error, success in results:
            if not success:
                print(f"- {os.path.basename(path)}: {error}")

    # List every successfully processed file with its output file.
    if success_count > 0:
        print("\n成功处理的文件:")
        for path, output, success in results:
            if success:
                print(f"- {os.path.basename(path)} -> {os.path.basename(output)}")

# Script entry point: start the interactive batch run only when executed directly.
if __name__ == "__main__":
    batch_process_documents()

这个跑完会在原文件同目录下生成一个同名的 json 文件，因为包含了原图的 base64，所以体积比较巨大。

然后这个是转成 html 预览的脚本

DocumentaiFormJsonViewer.py

import json
import tkinter as tk
from tkinter import filedialog
import os
from google.cloud import documentai_v1 as documentai
from PIL import Image
import io
import base64

def get_bounding_box_str(entity):
    """Format an entity's normalized bounding-box vertices as text.

    Returns a string such as ``"(0.100, 0.200), (0.300, 0.400)"`` with each
    coordinate shown to three decimals, or ``"N/A"`` when the entity has no
    bounding polygon, no normalized vertices, or lacks the expected
    attributes.
    """
    try:
        polygon = entity.bounding_poly
        points = polygon.normalized_vertices if polygon else None
        if not points:
            return "N/A"
        return ", ".join(f"({pt.x:.3f}, {pt.y:.3f})" for pt in points)
    except AttributeError:
        # Any missing attribute along the way means there is no usable box.
        return "N/A"

def _anchored_div(css_class, vertices, width, height, z_index, text="", extra_style=""):
    """Return a <div> whose left/top are taken from the first normalized vertex."""
    import html

    left = int(vertices[0].x * width)
    top = int(vertices[0].y * height)
    # Escape the text so characters such as "<" or "&" cannot break the page markup.
    return (
        f'<div class="{css_class}" style="left: {left}px; top: {top}px; '
        f'{extra_style}z-index: {z_index};">{html.escape(text)}</div>'
    )


def convert_json_to_html(json_file_path, font_size=12, image_opacity=0.5, show_visual_elements=False):
    """
    Converts Document AI JSON to HTML with layout preservation.

    Each page becomes a fixed-size container holding the page image (if any)
    as a faded background, with text blocks and form fields placed at the
    position of the first vertex of their bounding polygon. The result is
    written next to the input file with an ``.html`` extension.

    Args:
        json_file_path: Path to the Document AI JSON file.
        font_size:  The font size (in pixels) to use for the text.
        image_opacity: The opacity of the background image (0.0 to 1.0).
        show_visual_elements: Whether to show visual elements (like checkboxes).

    Returns:
        None. If the JSON file cannot be loaded, an error is printed and no
        HTML file is written.
    """
    try:
        with open(json_file_path, 'rb') as f:
            json_data = f.read()
        document = documentai.Document.from_json(json_data)

    except Exception as e:
        print(f"Error loading document: {e}")
        return

    # Collect fragments and join once at the end instead of repeated "+=".
    parts = [f"""
    <!DOCTYPE html>
    <html><head>
        <title>Document AI Output</title>
        <style>
            .page-container {{ position: relative; border: 1px solid black; }}
            /* Dynamic width so text is never truncated */
            .positioned-element {{ position: absolute; font-size: {font_size}px; }}
            .image-element {{ position: absolute; opacity: {image_opacity}; }}
            .checkbox {{ position: absolute; width: 12px; height: 12px; border: 1px solid gray; }}
            .checked {{ background-color: black; }}
            /* Visual element style */
            .visual-element {{ position: absolute; border: 1px dashed blue; width: 12px; height: 12px;}}
            .bbox {{ color: blue; font-size: smaller; }}
        </style>
    </head><body>
    """]

    for page in document.pages:
        width = int(page.dimension.width)
        height = int(page.dimension.height)

        parts.append(f'<div class="page-container" style="width: {width}px; height: {height}px;">')

        # 1. Page image (bottom layer), re-encoded as PNG and embedded inline.
        if page.image:
            try:
                image_data = page.image.content
                image = Image.open(io.BytesIO(image_data))
                buffered = io.BytesIO()
                image.save(buffered, format="PNG")
                img_str = base64.b64encode(buffered.getvalue()).decode()
                parts.append(
                    f'<img class="image-element" src="data:image/png;base64,{img_str}" style="left: 0px; top: 0px; width: {width}px; height: {height}px; z-index: 1; pointer-events: none;" disabled>'
                )
            except Exception as e:
                # A broken image should not prevent the text from rendering.
                print(f"Error processing image: {e}")

        # 2. Text blocks
        for block in page.blocks:
            bbox = block.layout.bounding_poly.normalized_vertices
            if bbox:
                text = get_text(block.layout, document)
                parts.append(_anchored_div("positioned-element", bbox, width, height, 3, text))

        # 3. Form fields
        for field in page.form_fields:
            # Field name (rendered bold)
            if field.field_name:
                name_bbox = field.field_name.bounding_poly.normalized_vertices
                name_text = get_text(field.field_name, document) or ""
                if name_bbox:
                    parts.append(
                        _anchored_div(
                            "positioned-element", name_bbox, width, height, 3,
                            name_text.strip(), extra_style="font-weight: bold; ",
                        )
                    )

            # Field value
            if field.field_value:
                value_bbox = field.field_value.bounding_poly.normalized_vertices
                value_text = get_text(field.field_value, document) or ""
                if value_bbox:
                    parts.append(
                        _anchored_div("positioned-element", value_bbox, width, height, 3, value_text.strip())
                    )

        # 4. Visual elements (optional): drawn as small fixed-size markers.
        if show_visual_elements:
            for visual_element in page.visual_elements:
                bbox = visual_element.layout.bounding_poly.normalized_vertices
                if bbox:
                    parts.append(_anchored_div("visual-element", bbox, width, height, 2))

        parts.append("</div>")

    parts.append("</body></html>")

    base_name = os.path.splitext(json_file_path)[0]
    html_file_path = base_name + ".html"
    with open(html_file_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"HTML output saved to: {html_file_path}")

def get_text(entity, document):
    """Return the text an entity points at inside ``document.text``.

    The entity's ``text_anchor.text_segments`` each give a start and end
    index into ``document.text``; the matching slices are concatenated in
    order. An empty string is returned when the entity has no
    ``text_anchor`` attribute, the anchor is empty, or it has no segments.
    """
    anchor = getattr(entity, 'text_anchor', None)
    if not anchor:
        return ""

    segments = anchor.text_segments
    if not segments:
        return ""

    return "".join(
        document.text[int(piece.start_index):int(piece.end_index)]
        for piece in segments
    )

def open_file_dialog():
    """Ask the user for a Document AI JSON file and convert it to HTML.

    Shows a file-open dialog restricted to ``*.json`` files. If a file is
    chosen it is passed to ``convert_json_to_html``; if the dialog is
    cancelled nothing happens.
    """
    # A hidden root window is required to host the file dialog.
    root = tk.Tk()
    root.withdraw()
    try:
        file_path = filedialog.askopenfilename(title="Select Document AI JSON File", filetypes=[("JSON files", "*.json")])
    finally:
        # Release the hidden window once the dialog has closed.
        root.destroy()

    if file_path:
        # Example settings: larger font, faint background image, visual elements hidden.
        convert_json_to_html(file_path, font_size=18, image_opacity=0.1, show_visual_elements=False)

# Script entry point: open the file picker only when executed directly.
if __name__ == "__main__":
    open_file_dialog()

(\ _ /)
( ･-･)
/っ就是这样。效果

KGLongWang · 2025 年2 月 27 日 05:49

能识别验证码吗

Yez · 2025 年2 月 27 日 05:57

感谢佬友分享

之前测试过一段时间 Document AI，感觉如果要深度使用或者用于生产环境，有几个坑点，借佬友的帖子分享一下：

第一个是谷歌的开发文档一如既往的糟糕，解析结果的json嵌套结构需要花时间理解；
第二个是Document AI的文件上传印象里只支持base64或者自家的云存储；

第三个是识别效果上，不论中英文，对一些自然段落（尤其是多栏结构或者复杂表格）的分隔不是太准确，需要二次处理

最后一个就是贵，基础版的Vision ai相对来说更划算一点

handsome · 2025 年2 月 27 日 06:44

你也太强了！

ati9527 · 2025 年2 月 27 日 07:48

帮顶帮顶

system · 2025 年3 月 29 日 07:49

此话题已在最后回复的 30 天后被自动关闭。不再允许新回复。

话题		回复	浏览量
国内主流大模型读取及处理文件能力测试搞七捻三人工智能	5	291	2025 年4 月 6 日
🔥【实践向】(๑•̀ㅂ•́)و✧ 用 Document AI 成功转录 403 页纯图扫描 PDF！开发调优 Gemini , 人工智能	19	1009	2025 年3 月 30 日
如何把一整个PDF的论文发给AI？资源荟萃人工智能 , 快问快答	25	935	2025 年4 月 7 日
PDF 转 Markdown 效果对比资源荟萃	34	2017	2025 年6 月 4 日
GLM OCR对比Deepseek OCR 2 开发调优人工智能	44	1412	2026 年2 月 28 日

🌋【GCP刨土系列】Document AI 带坐标的 OCR 识别

相关话题