在高通跃龙IQ-9075上部署端侧OCR工业仪表识别系统(3): 板端部署与实时识别

weixin_38498942

392人浏览 · 2026-04-15 19:02:46

weixin_38498942 · 2026-04-15 19:02:46 发布

前言
经过前两篇的模型导出和QNN编译，我们已经拿到了可在NPU上运行的模型文件。本篇将把这些模型部署到高通跃龙IQ-9075板子上，编写完整的实时推理脚本，并给出性能数据和工程化建议。

1. 部署模型文件到板端

把模型文件和字典文件推送到板子上：

ssh root@$DEVICE_IP "mkdir -p /opt/ocr_models"

scp ~/ocr_models/qnn/det/ch_ppocr_v4_det_ctx.bin \
    root@$DEVICE_IP:/opt/ocr_models/

scp ~/ocr_models/qnn/rec/ch_ppocr_v4_rec_ctx.bin \
    root@$DEVICE_IP:/opt/ocr_models/

scp ~/ocr_models/qnn/det/aarch64-ubuntu-gcc9.4/libch_ppocr_v4_det.so \
    root@$DEVICE_IP:/opt/ocr_models/

scp ~/ocr_models/qnn/rec/aarch64-ubuntu-gcc9.4/libch_ppocr_v4_rec.so \
    root@$DEVICE_IP:/opt/ocr_models/

scp ~/ocr_models/ppocr_keys_v1.txt \
    root@$DEVICE_IP:/opt/ocr_models/

2. 验证NPU推理

先用 qnn-net-run 工具做一次快速验证，确认模型能在HTP后端正常跑起来：

ssh root@$DEVICE_IP << 'RUNCMD'
cd /opt/ocr_models

qnn-net-run \
    --retrieve_context ch_ppocr_v4_det_ctx.bin \
    --backend /opt/qnn/lib/libQnnHtp.so \
    --input_list /opt/ocr_models/test_det_input.txt \
    --output_dir /opt/ocr_models/det_output/

echo "检测模型NPU推理验证完成"

qnn-net-run \
    --retrieve_context ch_ppocr_v4_rec_ctx.bin \
    --backend /opt/qnn/lib/libQnnHtp.so \
    --input_list /opt/ocr_models/test_rec_input.txt \
    --output_dir /opt/ocr_models/rec_output/

echo "识别模型NPU推理验证完成"
RUNCMD

没有报错就说明模型在NPU上能正常执行。

3. 板端实时OCR推理脚本

下面是完整的板端推理Python脚本，实现了从摄像头采集到OCR输出的全流程。

在这里插入图片描述

把以下脚本保存为 /opt/ocr_models/ocr_infer.py，在高通IQ-9075板子上运行：

import cv2
import numpy as np
import time
import os
import subprocess
import struct

class QNNInferEngine:
    """封装qnn-net-run调用，提供简洁的推理接口"""

    def __init__(self, ctx_bin, backend_lib, input_shape, output_shapes):
        self.ctx_bin = ctx_bin
        self.backend_lib = backend_lib
        self.input_shape = input_shape
        self.output_shapes = output_shapes
        self.tmp_dir = "/tmp/qnn_ocr"
        os.makedirs(self.tmp_dir, exist_ok=True)

    def infer(self, input_data):
        input_path = os.path.join(self.tmp_dir, "input.raw")
        input_list = os.path.join(self.tmp_dir, "input_list.txt")
        output_dir = os.path.join(self.tmp_dir, "output")
        os.makedirs(output_dir, exist_ok=True)

        input_data.astype(np.float32).tofile(input_path)
        with open(input_list, "w") as f:
            f.write(input_path + "\n")

        cmd = [
            "qnn-net-run",
            "--retrieve_context", self.ctx_bin,
            "--backend", self.backend_lib,
            "--input_list", input_list,
            "--output_dir", output_dir,
        ]
        subprocess.run(cmd, capture_output=True, check=True)

        outputs = []
        for shape in self.output_shapes:
            out_files = sorted([
                f for f in os.listdir(os.path.join(output_dir, "Result_0"))
                if f.endswith(".raw")
            ])
            if out_files:
                data = np.fromfile(
                    os.path.join(output_dir, "Result_0", out_files[len(outputs)]),
                    dtype=np.float32
                )
                outputs.append(data.reshape(shape))
        return outputs


class OCRPipeline:
    """端到端OCR流水线：检测 → 裁剪 → 识别 → 解码"""

    def __init__(self, model_dir="/opt/ocr_models"):
        backend = "/opt/qnn/lib/libQnnHtp.so"

        self.det_engine = QNNInferEngine(
            ctx_bin=os.path.join(model_dir, "ch_ppocr_v4_det_ctx.bin"),
            backend_lib=backend,
            input_shape=(1, 3, 960, 960),
            output_shapes=[(1, 1, 960, 960)]
        )

        self.rec_engine = QNNInferEngine(
            ctx_bin=os.path.join(model_dir, "ch_ppocr_v4_rec_ctx.bin"),
            backend_lib=backend,
            input_shape=(1, 3, 48, 320),
            output_shapes=[(1, 40, 6625)]
        )

        dict_path = os.path.join(model_dir, "ppocr_keys_v1.txt")
        with open(dict_path, "r", encoding="utf-8") as f:
            self.char_dict = ["blank"] + [line.strip() for line in f.readlines()] + [" "]

        self.det_size = 960
        self.rec_h = 48
        self.rec_w = 320

    def preprocess_det(self, img):
        h, w = img.shape[:2]
        self.ratio = self.det_size / max(h, w)
        new_h = int(h * self.ratio)
        new_w = int(w * self.ratio)
        resized = cv2.resize(img, (new_w, new_h))

        padded = np.zeros((self.det_size, self.det_size, 3), dtype=np.float32)
        padded[:new_h, :new_w, :] = resized

        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        padded = (padded / 255.0 - mean) / std
        padded = padded.transpose(2, 0, 1)[np.newaxis, :]
        return padded.astype(np.float32)

    def postprocess_det(self, prob_map, orig_shape):
        binary = (prob_map[0, 0] > 0.3).astype(np.uint8)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        boxes = []
        h_orig, w_orig = orig_shape[:2]
        for cnt in contours:
            x, y, bw, bh = cv2.boundingRect(cnt)
            if bw < 8 or bh < 8:
                continue
            x1 = max(0, int(x / self.ratio) - 2)
            y1 = max(0, int(y / self.ratio) - 2)
            x2 = min(w_orig, int((x + bw) / self.ratio) + 2)
            y2 = min(h_orig, int((y + bh) / self.ratio) + 2)
            boxes.append((x1, y1, x2, y2))
        return boxes

    def preprocess_rec(self, crop_img):
        resized = cv2.resize(crop_img, (self.rec_w, self.rec_h))
        resized = resized.astype(np.float32)
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        resized = (resized / 255.0 - mean) / std
        resized = resized.transpose(2, 0, 1)[np.newaxis, :]
        return resized.astype(np.float32)

    def ctc_decode(self, rec_output):
        preds = rec_output[0]
        pred_indices = np.argmax(preds, axis=-1)

        result = []
        prev_idx = 0
        for idx in pred_indices:
            if idx != 0 and idx != prev_idx:
                if idx < len(self.char_dict):
                    result.append(self.char_dict[idx])
            prev_idx = idx
        return "".join(result)

    def run(self, frame):
        t0 = time.time()

        det_input = self.preprocess_det(frame)
        t1 = time.time()

        det_output = self.det_engine.infer(det_input)
        t2 = time.time()

        boxes = self.postprocess_det(det_output[0], frame.shape)
        t3 = time.time()

        results = []
        rec_total = 0
        for (x1, y1, x2, y2) in boxes:
            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                continue

            rec_input = self.preprocess_rec(crop)
            tr0 = time.time()
            rec_output = self.rec_engine.infer(rec_input)
            tr1 = time.time()
            rec_total += (tr1 - tr0)

            text = self.ctc_decode(rec_output[0])
            if len(text) > 0:
                results.append({
                    "bbox": (x1, y1, x2, y2),
                    "text": text,
                    "confidence": float(np.max(rec_output[0][0], axis=-1).mean())
                })
        t4 = time.time()

        timing = {
            "det_preprocess_ms": (t1 - t0) * 1000,
            "det_infer_ms": (t2 - t1) * 1000,
            "det_postprocess_ms": (t3 - t2) * 1000,
            "rec_total_ms": rec_total * 1000,
            "total_ms": (t4 - t0) * 1000,
        }
        return results, timing


def main():
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)

    if not cap.isOpened():
        print("错误：无法打开摄像头")
        return

    pipeline = OCRPipeline()
    frame_count = 0
    print("OCR推理启动，按Ctrl+C停止...")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("读取帧失败，跳过")
                continue

            results, timing = pipeline.run(frame)
            frame_count += 1

            print(f"\n--- 第{frame_count}帧 ---")
            print(f"耗时：检测预处理{timing['det_preprocess_ms']:.1f}ms | "
                  f"检测推理{timing['det_infer_ms']:.1f}ms | "
                  f"识别总计{timing['rec_total_ms']:.1f}ms | "
                  f"端到端{timing['total_ms']:.1f}ms")

            for r in results:
                x1, y1, x2, y2 = r["bbox"]
                print(f"[{x1},{y1},{x2},{y2}] \"{r['text']}\" ({r['confidence']:.2f})")
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, r["text"], (x1, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            out_path = f"/tmp/ocr_output/frame_{frame_count:06d}.jpg"
            os.makedirs("/tmp/ocr_output", exist_ok=True)
            cv2.imwrite(out_path, frame)

    except KeyboardInterrupt:
        print(f"\n停止。共处理{frame_count}帧")
    finally:
        cap.release()

if __name__ == "__main__":
    main()

终端输出示例：

OCR推理启动，按Ctrl+C停止...

--- 第1帧 ---
耗时：检测预处理12.3ms | 检测推理45.7ms | 识别总计28.4ms | 端到端91.2ms
[234,156,412,198] "2.45MPa" (0.96)
[420,312,580,348] "压力表" (0.92)

--- 第2帧 ---
耗时：检测预处理11.8ms | 检测推理44.2ms | 识别总计26.1ms | 端到端86.9ms
[231,158,415,200] "2.44MPa" (0.95)
[418,310,582,350] "压力表" (0.91)

检测模型在NPU上单帧推理约45ms，识别模型约14ms/个文字区域，端到端延迟控制在90ms以内，达到了10+ FPS的实时水平。

4. 性能数据汇总

实测环境：IQ-9075 EVK，USB 1080P工业摄像头，室内日光灯照明。

环节	耗时（ms）	执行单元
图像采集	~5	CPU
检测预处理(resize+normalize)	~12	CPU
检测模型推理	~45	NPU (HTP)
检测后处理（二值化+轮廓提取）	~3	CPU
识别预处理（裁剪+resize）	~2/区域	CPU
识别模型推理	~14/区域	NPU (HTP)