Vllm 转 Ollama 接口

发布时间：2026/6/24 3:00:11

最新版本VisualStudio已经可以接入其他大语言模型了通过管理模型的接口进入设置但是他不支持其他的vllm、llama.cpp接口而且ollama接口也只支持本地lhttp://localhost:11434,其他不支持好像到以上位置这个点击添加按钮没啥用改地址也不行直接用大模型帮忙写一个脚本直接做接口转换把VS请求的localhost接口转到已经部署的vllm接口其他接口类似就行但是需要注意的是要实现完整的ollama接口#!/usr/bin/env python3 Ollama to vLLM 简单接口转发 import json import os import sys from datetime import datetime from fastapi import FastAPI, Request, HTTPException from fastapi.responses import StreamingResponse, JSONResponse import httpx # 配置 VLLM_URL http://172.16.1.15:8000 VLLM_API_KEY # 如果有 API key 填这里 OLLAMA_NAME qwen3.6-27b # 本地 Ollama 接口显示的模型名 VLLM_MODEL /data/ai/models/Qwen3.6-27B # vLLM 上的模型名 # app FastAPI() client httpx.AsyncClient(timeout300.0) def log_request(method, url, bodyNone, headersNone): 打印请求日志 print(f\n{*60}, flushTrue) print(f[{datetime.now().strftime(%H:%M:%S)}] {method} {url}, flushTrue) if headers: print(fHeaders: {json.dumps(headers, ensure_asciiFalse)}, flushTrue) if body: print(fBody: {json.dumps(body, ensure_asciiFalse, indent2)}, flushTrue) print(f{*60}\n, flushTrue) def log_response(status_code, dataNone): 打印响应日志 print(f[{datetime.now().strftime(%H:%M:%S)}] Response: {status_code}, flushTrue) if data: print(fData: {json.dumps(data, ensure_asciiFalse, indent2)}, flushTrue) print(f{*60}\n, flushTrue) app.get(/api/tags) async def get_models(): 获取模型列表 - 直接调用 vLLM url f{VLLM_URL}/v1/models log_request(GET, url) resp await client.get(url) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: return {models: [{name: OLLAMA_NAME, model: OLLAMA_NAME}]} data resp.json() models [] for m in data.get(data, []): max_len m.get(max_model_len, 131072) models.append({ name: OLLAMA_NAME, model: OLLAMA_NAME, modified_at: datetime.utcfromtimestamp(m.get(created, 0)).strftime(%Y-%m-%dT%H:%M:%S) 00:00, size: 0, digest: , details: { parent_model: m.get(parent, ) or , format: vllm, family: qwen3, families: [qwen3], parameter_size: 27B, quantization_level: FP16, context_length: max_len, embedding_length: 5120 }, capabilities: [vision, completion, tools] }) return {models: models if models else [{name: OLLAMA_NAME, model: OLLAMA_NAME}]} app.get(/api/ps) async def running_models(): 列出正在运行的模型 return {models: [{name: OLLAMA_NAME, model: OLLAMA_NAME}]} app.post(/api/show) async def show_model(request: Request): Ollama 模型信息接口 - 直接调用 vLLM url f{VLLM_URL}/v1/models log_request(GET, url) resp await client.get(url) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: return {license: , modelfile: , parameters: } data resp.json() model_info data.get(data, [{}])[0] max_len model_info.get(max_model_len, 131072) # 构建参数字符串 params [ fmax_model_len: {max_len}, fcreated: {model_info.get(created, )}, fowned_by: {model_info.get(owned_by, )} ] return { license: , modelfile: fFROM {VLLM_MODEL}\n\nPARAMETER max_model_len {max_len}\n\nSYSTEM \\\You are a helpful assistant.\\\, parameters: \n.join(params), details: { parent_model: model_info.get(parent, ) or , format: vllm, family: qwen3, families: [qwen3], parameter_size: 27B, quantization_level: FP16, context_length: max_len, embedding_length: 5120 }, capabilities: [vision, completion, tools] } app.post(/api/chat) async def chat(request: Request): 聊天接口 - Ollama 格式转 vLLM 格式支持工具调用和图像理解 body await request.json() messages body.get(messages, []) stream body.get(stream, False) vllm_request { model: VLLM_MODEL, messages: messages, temperature: body.get(temperature, 1.0), top_p: body.get(top_p, 1.0), stream: stream } # 支持工具调用 if body.get(tools): vllm_request[tools] body[tools] # 支持工具结果 if body.get(tool_choice): vllm_request[tool_choice] body[tool_choice] # 支持响应格式 if body.get(response_format): vllm_request[response_format] body[response_format] if body.get(max_tokens): vllm_request[max_tokens] body[max_tokens] headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} if stream: log_request(POST, f{VLLM_URL}/v1/chat/completions, vllm_request, headers) return StreamingResponse( stream_response(vllm_request, headers), media_typeapplication/x-ndjson ) else: log_request(POST, f{VLLM_URL}/v1/chat/completions, vllm_request, headers) resp await client.post(f{VLLM_URL}/v1/chat/completions, jsonvllm_request, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() choice data[choices][0] result { model: OLLAMA_NAME, message: {role: assistant, content: choice[message][content]}, done: True } # 支持工具调用返回 if choice[message].get(tool_calls): result[message][tool_calls] choice[message][tool_calls] return result async def stream_response(vllm_request, headers): 流式响应 log_request(POST, f{VLLM_URL}/v1/chat/completions, vllm_request, headers) async with client.stream(POST, f{VLLM_URL}/v1/chat/completions, jsonvllm_request, headersheaders) as resp: print(f[{datetime.now().strftime(%H:%M:%S)}] 流式响应状态: {resp.status_code}, flushTrue) async for line in resp.aiter_lines(): if line.startswith(data: ): data line[6:] if data [DONE]: yield {done:true}\n break try: chunk json.loads(data) content chunk[choices][0].get(delta, {}).get(content, ) if content: yield json.dumps({model: OLLAMA_NAME, message: {role: assistant, content: content}, done: False}) \n except (json.JSONDecodeError, KeyError): continue app.post(/api/generate) async def generate(request: Request): 文本生成接口 body await request.json() vllm_request { model: VLLM_MODEL, prompt: body.get(prompt, ), temperature: body.get(temperature, 1.0), top_p: body.get(top_p, 1.0), stream: body.get(stream, False) } headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} if body.get(stream): log_request(POST, f{VLLM_URL}/v1/completions, vllm_request, headers) return StreamingResponse( stream_generate(vllm_request, headers), media_typeapplication/x-ndjson ) else: log_request(POST, f{VLLM_URL}/v1/completions, vllm_request, headers) resp await client.post(f{VLLM_URL}/v1/completions, jsonvllm_request, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() return { model: OLLAMA_NAME, response: data[choices][0][text], done: True } async def stream_generate(vllm_request, headers): 流式生成 log_request(POST, f{VLLM_URL}/v1/completions, vllm_request, headers) async with client.stream(POST, f{VLLM_URL}/v1/completions, jsonvllm_request, headersheaders) as resp: print(f[{datetime.now().strftime(%H:%M:%S)}] 流式生成状态: {resp.status_code}, flushTrue) async for line in resp.aiter_lines(): if line.startswith(data: ): data line[6:] if data [DONE]: yield {done:true}\n break try: chunk json.loads(data) text chunk[choices][0].get(text, ) if text: yield json.dumps({model: OLLAMA_NAME, response: text, done: False}) \n except (json.JSONDecodeError, KeyError): continue app.post(/api/embed) async def embed(request: Request): 生成文本嵌入向量 - 调用 vLLM body await request.json() inputs body.get(input, []) if not isinstance(inputs, list): inputs [inputs] headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} vllm_body {model: VLLM_MODEL, input: inputs} log_request(POST, f{VLLM_URL}/v1/embeddings, vllm_body, headers) resp await client.post( f{VLLM_URL}/v1/embeddings, jsonvllm_body, headersheaders ) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() embeddings [item.get(embedding, []) for item in data.get(data, [])] return {embeddings: embeddings} app.post(/api/embeddings) async def embeddings(request: Request): 旧版嵌入接口 - 调用 vLLM body await request.json() prompt body.get(prompt, ) headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} vllm_body {model: VLLM_MODEL, input: prompt} log_request(POST, f{VLLM_URL}/v1/embeddings, vllm_body, headers) resp await client.post( f{VLLM_URL}/v1/embeddings, jsonvllm_body, headersheaders ) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) data resp.json() embedding data.get(data, [{}])[0].get(embedding, []) return {embedding: embedding} app.post(/v1/chat/completions) async def v1_chat(request: Request): OpenAI 格式接口 - 直接透传到 vLLM body await request.json() body[model] VLLM_MODEL headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} if body.get(stream, False): log_request(POST, f{VLLM_URL}/v1/chat/completions, body, headers) return StreamingResponse( stream_v1_chat(body, headers), media_typetext/event-stream ) else: log_request(POST, f{VLLM_URL}/v1/chat/completions, body, headers) resp await client.post(f{VLLM_URL}/v1/chat/completions, jsonbody, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) return resp.json() async def stream_v1_chat(body, headers): OpenAI 格式流式响应 log_request(POST, f{VLLM_URL}/v1/chat/completions, body, headers) async with client.stream(POST, f{VLLM_URL}/v1/chat/completions, jsonbody, headersheaders) as resp: print(f[{datetime.now().strftime(%H:%M:%S)}] v1 流式响应状态: {resp.status_code}, flushTrue) async for line in resp.aiter_lines(): yield line \n app.post(/v1/completions) async def v1_completions(request: Request): OpenAI 格式补全接口 - 直接透传到 vLLM body await request.json() body[model] VLLM_MODEL headers {Content-Type: application/json} if VLLM_API_KEY: headers[Authorization] fBearer {VLLM_API_KEY} log_request(POST, f{VLLM_URL}/v1/completions, body, headers) resp await client.post(f{VLLM_URL}/v1/completions, jsonbody, headersheaders) log_response(resp.status_code, resp.json()) if resp.status_code ! 200: raise HTTPException(status_coderesp.status_code, detailresp.text) return resp.json() app.get(/v1/models) async def v1_models(): OpenAI 格式模型列表 url f{VLLM_URL}/v1/models log_request(GET, url) resp await client.get(url) log_response(resp.status_code, resp.json()) return resp.json() if __name__ __main__: import uvicorn print(启动 Ollama 代理: http://127.0.0.1:11434) print(f转发到 vLLM: {VLLM_URL}) uvicorn.run(app, host0.0.0.0, port11434)运行以上代码 python main.py然后回到VS自带模型点击添加出现实现的模型接口勾选模型点击保存就可以通过模型选择进行指定刚才设置的模型了

文章详情

Vllm 转 Ollama 接口

相关新闻

最新新闻

日新闻

周新闻

月新闻