vllm推理qwen14B、qwen2.5-vl-32B
【代码】vllm推理qwen14B、qwen2.5-vl-32B
·
一、vllm启动模型
# 1. Restrict the service to the GPUs we want to use.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# 2. Make vLLM worker processes start with the "spawn" method (needed when
#    CUDA is already initialized in the parent process).
#    NOTE(review): the original exported VLLM_USE_SPAWN=True, which is not a
#    documented vLLM variable; the documented knob is
#    VLLM_WORKER_MULTIPROC_METHOD=spawn.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# 3. Launch the OpenAI-compatible server with 4-way tensor parallelism.
vllm serve modelscope/models/Qwen/Qwen2.5-VL-32B-Instruct --dtype bfloat16 --tensor-parallel-size 4 --gpu-memory-utilization 0.99 --max-model-len 4096 --enforce-eager --host 0.0.0.0 --port 7860
二、qwen14B_vllm
# -*- coding:utf-8 -*-
from openai import OpenAI
def qwen14B_vllm(prompt, inputdata):
    """Send one chat request to a local vLLM OpenAI-compatible server.

    Args:
        prompt: System message that frames the assistant's role.
        inputdata: User message (the text to be processed).

    Returns:
        The assistant's reply text (str).
    """
    # vLLM ignores the API key, but the OpenAI client requires a non-empty one.
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
    )
    response = client.chat.completions.create(
        model="/root/autodl-tmp/cache/modelscope/hub/Qwen/Qwen2___5-14B-Instruct-GPTQ-Int8",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": inputdata},
        ],
        temperature=0.7,
        top_p=0.8,
        max_tokens=8192,
        # repetition_penalty is a vLLM extension, not part of the OpenAI API,
        # so it travels in extra_body.
        extra_body={"repetition_penalty": 1.05},
    )
    return response.choices[0].message.content
if __name__ == '__main__':
    # Demo: reformat one piece of official-document text via the local server.
    system_prompt = "你是政务公文专家,请把用户输入的公文输出为正确的格式。只输出正文,不要添加多余内容。"
    user_text = "公文内容"
    qwen14B_vllm(system_prompt, user_text)
三、qwen2.5-vl-32B
1、使用requests方式调用
import requests
import json
import base64
def encode_image(image_path):
    """Read a local image file and return its contents as a base64 string."""
    with open(image_path, "rb") as fh:
        raw = fh.read()
    return base64.b64encode(raw).decode('utf-8')
# 1. Endpoint of the vLLM OpenAI-compatible server.
url = 'http://192.168.2.21:7860/v1/chat/completions'

# 2. Build the request payload: a system message plus one multimodal user
#    message (the image is sent inline as a base64 data URL).
image_path = "1.jpg"
base64_image = encode_image(image_path)  # encode the local image
data = {
    "model": "/mnt/workspace/.cache/modelscope/models/Qwen/Qwen2.5-VL-32B-Instruct",
    "messages": [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user",
         "content": [
             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
             {"type": "text", "text": "请描述图片中的内容"},
         ]},
    ],
    "temperature": 0.7,
    "top_p": 0.8,
    "repetition_penalty": 1.05,
    "max_tokens": 2048,
}

# 3. Send the POST request. `json=` serializes the dict and sets the
#    Content-Type header in one step (replaces manual json.dumps + headers).
response = requests.post(url, json=data)
# Fail fast with a readable HTTP error instead of crashing while parsing.
response.raise_for_status()

# 4. Extract the assistant's reply. The original chained .get() calls with
#    mismatched defaults ([] where a dict/str was expected), which raised an
#    opaque IndexError/AttributeError on any error response.
body = response.json()
choices = body.get("choices") or []
if choices:
    print(choices[0].get("message", {}).get("content", ""))
else:
    # Surface the server's error payload for debugging.
    print(body)
2、使用openai方式
from openai import OpenAI
import base64
def encode_image(image_path):
    """Base64-encode a local image so it can be embedded in a data URL."""
    with open(image_path, "rb") as source:
        encoded = base64.b64encode(source.read())
    return encoded.decode('utf-8')
# Talk to the vLLM server through the official OpenAI client. vLLM does not
# validate the key, but the client requires one to be set.
client = OpenAI(
    base_url="http://192.168.2.21:7860/v1/",
    api_key="0",
)

image_path = "1.jpg"
base64_image = encode_image(image_path)  # base64-encode the local image

# One multimodal user turn: the image (as a data URL) followed by the question.
completion = client.chat.completions.create(
    model="/mnt/workspace/.cache/modelscope/models/Qwen/Qwen2.5-VL-32B-Instruct",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                {"type": "text", "text": "请描述图片中的内容"},
            ],
        },
    ],
    temperature=0.7,
    top_p=0.8,
    max_tokens=2048,
)

print(completion.choices[0])
print(completion.choices[0].message.content)
四、qwen14B-多线程
# -*- coding:utf-8 -*-
import json
from qwen14B_vllm import qwen14B_vllm
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import count
def filter_strings(input_file, output_file):
    """Reformat the 'target' field of every JSONL record with qwen14B_vllm.

    Reads *input_file* (one JSON object per line), rewrites each record's
    'target' through the model using a thread pool, and writes the results
    to *output_file* as JSONL. NOTE: as_completed yields futures in
    completion order, so output line order may differ from input order
    (same behavior as the original).

    Args:
        input_file: Path to the input .jsonl file.
        output_file: Path to the output .jsonl file.
    """
    prompt = "你是政务公文专家,请把用户输入的公文输出为正确的格式。只输出正文,不要添加多余内容。"

    def process_target(json_line):
        # Run one record's 'target' text through the model and store it back.
        json_line['target'] = qwen14B_vllm(prompt, json_line.get('target', ''))
        return json_line

    output_data = []
    processed_count = 0

    with open(input_file, 'r', encoding='utf-8') as file:
        # Skip blank lines so a trailing newline doesn't crash json.loads.
        lines = [line for line in file if line.strip()]

    total_items = len(lines)
    print(f"Total items to process: {total_items}")

    with ThreadPoolExecutor(max_workers=11) as executor:
        futures = [executor.submit(process_target, json.loads(line)) for line in lines]
        for future in as_completed(futures):
            try:
                output_data.append(future.result())
                # Plain increment replaces the redundant itertools.count()
                # iterator from the original.
                processed_count += 1
                if processed_count % 10 == 0:  # progress every 10 records
                    print(f"Processed {processed_count} out of {total_items} items.")
            except Exception as exc:
                print(f"Generated an exception: {exc}")

    print(f"Processing completed. Total processed items: {processed_count}")

    # Write the (completion-ordered) records back out as JSONL.
    with open(output_file, 'w', encoding='utf-8') as file:
        for json_line in output_data:
            file.write(json.dumps(json_line, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # Reformat the test set and write the results to a sibling file.
    source_path = 'gongwen_test.jsonl'
    dest_path = 'gongwen_test_01.jsonl'
    filter_strings(source_path, dest_path)
更多推荐



所有评论(0)