vllm推理qwen14B、qwen2.5-vl-32B
【代码】vllm推理qwen14B、qwen2.5-vl-32B
·
一、vllm启动模型
# 1. Restrict the service to the GPUs we want to use.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# 2. Make vLLM worker processes start with the "spawn" method (needed when
#    CUDA is already initialized in the parent process).
#    NOTE(review): the original exported VLLM_USE_SPAWN=True, which is not a
#    documented vLLM variable; the documented knob is
#    VLLM_WORKER_MULTIPROC_METHOD=spawn.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# 3. Launch the OpenAI-compatible server with 4-way tensor parallelism.
vllm serve modelscope/models/Qwen/Qwen2.5-VL-32B-Instruct --dtype bfloat16 --tensor-parallel-size 4 --gpu-memory-utilization 0.99 --max-model-len 4096 --enforce-eager --host 0.0.0.0 --port 7860
二、qwen14B_vllm
# -*- coding:utf-8 -*-
from openai import OpenAI
def qwen14B_vllm(prompt, inputdata):
    """Send one chat request to a local vLLM OpenAI-compatible server.

    Args:
        prompt: System message that frames the assistant's role.
        inputdata: User message (the text to be processed).

    Returns:
        The assistant's reply text (str).
    """
    # vLLM ignores the API key, but the OpenAI client requires a non-empty one.
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
    )
    response = client.chat.completions.create(
        model="/root/autodl-tmp/cache/modelscope/hub/Qwen/Qwen2___5-14B-Instruct-GPTQ-Int8",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": inputdata},
        ],
        temperature=0.7,
        top_p=0.8,
        max_tokens=8192,
        # repetition_penalty is a vLLM extension, not part of the OpenAI API,
        # so it travels in extra_body.
        extra_body={"repetition_penalty": 1.05},
    )
    return response.choices[0].message.content
if __name__ == '__main__':
    # Demo: reformat one piece of official-document text via the local server.
    system_prompt = "你是政务公文专家,请把用户输入的公文输出为正确的格式。只输出正文,不要添加多余内容。"
    user_text = "公文内容"
    qwen14B_vllm(system_prompt, user_text)
三、qwen2.5-vl-32B
1、使用requests方式调用
import requests
import json
import base64
def encode_image(image_path):
    """Read a local image file and return its contents as a base64 string."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode('utf-8')
# 1. Endpoint of the vLLM OpenAI-compatible server.
url = 'http://192.168.2.21:7860/v1/chat/completions'

# 2. Build the request payload: a system message plus one multimodal user
#    message (the image is sent inline as a base64 data URL).
image_path = "1.jpg"
base64_image = encode_image(image_path)  # encode the local image
data = {
    "model": "/mnt/workspace/.cache/modelscope/models/Qwen/Qwen2.5-VL-32B-Instruct",
    "messages": [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user",
         "content": [
             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
             {"type": "text", "text": "请描述图片中的内容"},
         ]},
    ],
    "temperature": 0.7,
    "top_p": 0.8,
    "repetition_penalty": 1.05,
    "max_tokens": 2048,
}

# 3. Send the POST request. `json=` serializes the dict and sets the
#    Content-Type header in one step (replaces manual json.dumps + headers).
response = requests.post(url, json=data)
# Fail fast with a readable HTTP error instead of crashing while parsing.
response.raise_for_status()

# 4. Extract the assistant's reply. The original chained .get() calls with
#    mismatched defaults ([] where a dict/str was expected), which raised an
#    opaque IndexError/AttributeError on any error response.
body = response.json()
choices = body.get("choices") or []
if choices:
    print(choices[0].get("message", {}).get("content", ""))
else:
    # Surface the server's error payload for debugging.
    print(body)
2、使用openai方式
from openai import OpenAI
import base64
def encode_image(image_path):
    """Base64-encode a local image so it can be embedded in a data URL."""
    with open(image_path, "rb") as source:
        encoded = base64.b64encode(source.read())
    return encoded.decode('utf-8')
# Talk to the vLLM server through the official OpenAI client. vLLM does not
# validate the key, but the client requires one to be set.
client = OpenAI(
    base_url="http://192.168.2.21:7860/v1/",
    api_key="0",
)

image_path = "1.jpg"
base64_image = encode_image(image_path)  # base64-encode the local image

# One multimodal user turn: the image (as a data URL) followed by the question.
completion = client.chat.completions.create(
    model="/mnt/workspace/.cache/modelscope/models/Qwen/Qwen2.5-VL-32B-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                {"type": "text", "text": "请描述图片中的内容"},
            ],
        },
    ],
    temperature=0.7,
    top_p=0.8,
    max_tokens=2048,
)

print(completion.choices[0])
print(completion.choices[0].message.content)
四、qwen14B-多线程
# -*- coding:utf-8 -*-
import json
from qwen14B_vllm import qwen14B_vllm
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import count
def filter_strings(input_file, output_file):
    """Reformat the 'target' field of every JSONL record with qwen14B_vllm.

    Reads *input_file* (one JSON object per line), rewrites each record's
    'target' through the model using a thread pool, and writes the results
    to *output_file* as JSONL. NOTE: as_completed yields futures in
    completion order, so output line order may differ from input order
    (same behavior as the original).

    Args:
        input_file: Path to the input .jsonl file.
        output_file: Path to the output .jsonl file.
    """
    prompt = "你是政务公文专家,请把用户输入的公文输出为正确的格式。只输出正文,不要添加多余内容。"

    def process_target(json_line):
        # Run one record's 'target' text through the model and store it back.
        json_line['target'] = qwen14B_vllm(prompt, json_line.get('target', ''))
        return json_line

    output_data = []
    processed_count = 0

    with open(input_file, 'r', encoding='utf-8') as file:
        # Skip blank lines so a trailing newline doesn't crash json.loads.
        lines = [line for line in file if line.strip()]

    total_items = len(lines)
    print(f"Total items to process: {total_items}")

    with ThreadPoolExecutor(max_workers=11) as executor:
        futures = [executor.submit(process_target, json.loads(line)) for line in lines]
        for future in as_completed(futures):
            try:
                output_data.append(future.result())
                # Plain increment replaces the redundant itertools.count()
                # iterator from the original.
                processed_count += 1
                if processed_count % 10 == 0:  # progress every 10 records
                    print(f"Processed {processed_count} out of {total_items} items.")
            except Exception as exc:
                print(f"Generated an exception: {exc}")

    print(f"Processing completed. Total processed items: {processed_count}")

    # Write the (completion-ordered) records back out as JSONL.
    with open(output_file, 'w', encoding='utf-8') as file:
        for json_line in output_data:
            file.write(json.dumps(json_line, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # Reformat the test set and write the results to a sibling file.
    source_path = 'gongwen_test.jsonl'
    dest_path = 'gongwen_test_01.jsonl'
    filter_strings(source_path, dest_path)
更多推荐



所有评论(0)