diff --git a/server/api/chat_routes.py b/server/api/chat_routes.py
index aaf6342..47460fb 100644
--- a/server/api/chat_routes.py
+++ b/server/api/chat_routes.py
@@ -10,7 +10,7 @@ from pathlib import Path
 from fastapi import HTTPException, File, UploadFile
 from fastapi.responses import JSONResponse, StreamingResponse
 import dashscope
-from dashscope import Generation
+from dashscope import Generation, MultiModalConversation

 # Import models and utility functions (using absolute paths)
 import sys
@@ -40,6 +40,14 @@ async def chat_endpoint_handler(body: dict):
     This endpoint receives chat requests from the frontend and forwards them to the Aliyun Bailian API
     """
     try:
+        # Make sure body is a dictionary
+        if not isinstance(body, dict):
+            print(f"[ERROR] Request body is not a dictionary: {type(body)}")
+            raise HTTPException(
+                status_code=400,
+                detail=f"Request body must be a JSON object, got {type(body).__name__}: {body}"
+            )
+
         # Check the request format and adapt it
         # If it is the OpenAI-compatible format (from streamChat)
         if 'messages' in body:
@@ -69,93 +77,358 @@ async def chat_endpoint_handler(body: dict):
         temperature = body.get('temperature', 0.7)
         max_tokens = body.get('maxTokens', 2000)

+        # Check whether the messages contain images; if so, this is a multimodal request handled via MultiModalConversation
+        has_images = any(
+            isinstance(msg, dict) and
+            isinstance(msg.get('content'), list) and
+            any(isinstance(item, dict) and item.get('type') == 'image_url' for item in msg.get('content', []))
+            for msg in messages if isinstance(msg, dict)
+        )
+
+        if has_images:
+            # Use the multimodal API to handle images
+            return await multimodal_chat_handler(messages, model, stream, temperature, max_tokens)
+        else:
+            # Use the regular chat API
+            if stream:
+                # Streaming response
+                async def event_generator():
+                    try:
+                        responses = Generation.call(
+                            model=model,
+                            messages=messages,
+                            stream=True,
+                            max_tokens=max_tokens,
+                            temperature=temperature
+                        )
+
+                        full_content = ""  # accumulates the full content so far
+
+                        for idx, response in enumerate(responses):
+                            if response.status_code == 200:
+                                # Check whether the response contains the expected content
+                                # The DashScope API response structure may be output.choices or output.text
+                                content = None
+
+                                # Try to get the content from output.choices
+                                if (hasattr(response, 'output') and
+                                    response.output and
+                                    hasattr(response.output, 'choices') and
+                                    response.output.choices is not None and
+                                    len(response.output.choices) > 0 and
+                                    'message' in response.output.choices[0] and
+                                    'content' in response.output.choices[0]['message']):
+
+                                    content = response.output.choices[0]['message']['content']
+
+                                    # Only send a delta when the content has changed
+                                    if len(content) > len(full_content):
+                                        delta_content = extract_delta_content(content, full_content)
+                                        full_content = content
+
+                                        if delta_content.strip():  # only send when there is non-whitespace new content
+                                            # Build the SSE data chunk
+                                            data = {
+                                                "id": f"chatcmpl-{generate_unique_id()}",
+                                                "object": "chat.completion.chunk",
+                                                "created": get_current_timestamp(),
+                                                "model": model,
+                                                "choices": [
+                                                    {
+                                                        "index": 0,
+                                                        "delta": {"content": delta_content},
+                                                        "finish_reason": None
+                                                    }
+                                                ]
+                                            }
+
+                                            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
+                                # Otherwise try to get the content from output.text (DashScope-specific format)
+                                elif (hasattr(response, 'output') and
+                                      response.output and
+                                      'text' in response.output):
+
+                                    content = response.output.get('text')
+
+                                    # Only send a delta when the content has changed
+                                    if len(content) > len(full_content):
+                                        delta_content = extract_delta_content(content, full_content)
+                                        full_content = content
+
+                                        if delta_content.strip():  # only send when there is non-whitespace new content
+                                            # Build the SSE data chunk
+                                            data = {
+                                                "id": f"chatcmpl-{generate_unique_id()}",
+                                                "object": "chat.completion.chunk",
+                                                "created": get_current_timestamp(),
+                                                "model": model,
+                                                "choices": [
+                                                    {
+                                                        "index": 0,
+                                                        "delta": {"content": delta_content},
+                                                        "finish_reason": None
+                                                    }
+                                                ]
+                                            }
+
+                                            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
+                            else:
+                                # Error handling
+                                error_data = {
+                                    "error": {
+                                        "message": f"API Error: {response.code} - {response.message}",
+                                        "type": "api_error",
+                                        "param": None,
+                                        "code": response.code
+                                    }
+                                }
+                                yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
+                                break
+
+                        # Send the finish signal
+                        finish_data = {
+                            "id": f"chatcmpl-{generate_unique_id()}",
+                            "object": "chat.completion.chunk",
+                            "created": get_current_timestamp(),
+                            "model": model,
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "delta": {},
+                                    "finish_reason": "stop"
+                                }
+                            ]
+                        }
+                        yield f"data: {json.dumps(finish_data, ensure_ascii=False)}\n\n"
+                        yield "data: [DONE]\n\n"
+                    except Exception as e:
+                        error_data = {
+                            "error": {
+                                "message": str(e),
+                                "type": "server_error"
+                            }
+                        }
+                        yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
+
+                return StreamingResponse(event_generator(), media_type="text/event-stream")
+            else:
+                # Non-streaming response
+                response = Generation.call(
+                    model=model,
+                    messages=messages,
+                    stream=False,
+                    max_tokens=max_tokens,
+                    temperature=temperature
+                )
+
+                if response.status_code == 200:
+                    # Check whether the response contains the expected content
+                    # The DashScope API response structure may be output.choices or output.text
+                    content = None
+
+                    # Try to get the content from output.choices
+                    if (hasattr(response, 'output') and
+                        response.output and
+                        hasattr(response.output, 'choices') and
+                        response.output.choices is not None and
+                        len(response.output.choices) > 0 and
+                        'message' in response.output.choices[0] and
+                        'content' in response.output.choices[0]['message']):
+
+                        content = response.output.choices[0]['message']['content']
+                    # Otherwise try to get the content from output.text (DashScope-specific format)
+                    elif (hasattr(response, 'output') and
+                          response.output and
+                          'text' in response.output):
+
+                        content = response.output.get('text')
+
+                    if content:
+                        # Build the response format the frontend expects
+                        chat_response = format_api_response(
+                            content=content,
+                            conversation_id=body.get('conversationId'),
+                            model=model
+                        )
+
+                        if hasattr(response, 'usage') and response.usage:
+                            chat_response["usage"] = {
+                                "promptTokens": response.usage.input_tokens,
+                                "completionTokens": response.usage.output_tokens,
+                                "totalTokens": response.usage.total_tokens
+                            }
+
+                        return JSONResponse(content=chat_response, ensure_ascii=False)
+                    else:
+                        raise HTTPException(
+                            status_code=500,
+                            detail="API Response does not contain expected content"
+                        )
+                else:
+                    raise HTTPException(
+                        status_code=500,
+                        detail=f"API Error: {response.code} - {response.message}"
+                    )
+
+    except Exception as e:
+        print(f"[ERROR] Error in chat endpoint: {str(e)}")
+        import traceback
+        print(f"[ERROR] Traceback: {traceback.format_exc()}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+async def multimodal_chat_handler(messages, model, stream, temperature, max_tokens):
+    """
+    Multimodal chat handler - processes messages that contain images
+    """
+    try:
+        # Convert OpenAI-format messages to the DashScope MultiModalConversation format
+        dashscope_messages = []
+        for i, msg in enumerate(messages):
+            # Verify that msg is a dict; if not, handle it as a plain string
+            if not isinstance(msg, dict):
+                # If the message is not a dict, treat it as plain text
+                dashscope_content = [
+                    {'text': str(msg)}
+                ]
+                dashscope_messages.append({
+                    'role': 'user',
+                    'content': dashscope_content
+                })
+                continue
+
+            role = msg.get('role', 'user')
+            content = msg.get('content', '')
+
+            if isinstance(content, str):
+                # Plain text content
+                dashscope_content = [
+                    {'text': content}
+                ]
+            elif isinstance(content, list):
+                # Content that contains images and text
+                dashscope_content = []
+                for j, item in enumerate(content):
+                    if isinstance(item, dict):
+                        if item.get('type') == 'text':
+                            dashscope_content.append({'text': item.get('text', '')})
+                        elif item.get('type') == 'image_url':
+                            # image_url may be either a string or a dict
+                            image_url_value = item.get('image_url', '')
+                            if isinstance(image_url_value, str):
+                                # If image_url is a string, use it directly
+                                img_url = image_url_value
+                            elif isinstance(image_url_value, dict) and 'url' in image_url_value:
+                                # If image_url is a dict, take the url from it
+                                img_url = image_url_value.get('url', '')
+                            else:
+                                # Anything else is treated as an error / empty value
+                                img_url = ''
+
+                            # If the URL is an http(s) URL, extract the file name and convert it to a file:// URL
+                            if img_url.startswith('http://') or img_url.startswith('https://'):
+                                # Extract the file part of the URL (e.g. uploads/filename.jpg from http://localhost:8000/uploads/filename.jpg)
+                                from urllib.parse import urlparse
+                                parsed_url = urlparse(img_url)
+                                path_parts = parsed_url.path.split('/')
+
+                                # Find the 'uploads' segment and the file name that follows it in the path
+                                try:
+                                    uploads_index = path_parts.index('uploads')
+                                    filename = '/'.join(path_parts[uploads_index:])  # e.g. uploads/filename.jpg
+                                    img_url = f"file://{filename}"
+                                except ValueError:
+                                    # If the path has no 'uploads' segment, use the original path
+                                    img_url = f"file://{parsed_url.path.lstrip('/')}"
+                            elif not img_url.startswith('file://'):
+                                # If it is neither a web URL nor a file:// URL, assume it is a relative path
+                                img_url = f"file://{img_url}"
+
+                            if img_url.startswith('file://'):
+                                # Make sure the local file exists
+                                import os
+                                local_path = img_url[7:]  # strip the "file://" prefix
+                                if not os.path.exists(local_path):
+                                    print(f"[WARNING] Image file does not exist: {local_path}")
+
+                            dashscope_content.append({'image': img_url})
+                    else:
+                        # Convert non-dict items to text
+                        dashscope_content.append({'text': str(item)})
+            else:
+                # Convert anything else to text
+                dashscope_content = [
+                    {'text': str(content)}
+                ]
+
+            dashscope_messages.append({
+                'role': role,
+                'content': dashscope_content
+            })
+
         if stream:
-            # Streaming response
-            async def event_generator():
+            # Multimodal streaming response
+            async def multimodal_event_generator():
                 try:
-                    responses = Generation.call(
-                        model=model,
-                        messages=messages,
+                    responses = MultiModalConversation.call(
+                        model=model.replace('qwen-', 'qwen-vl-') if 'qwen-' in model else 'qwen-vl-max',
+                        messages=dashscope_messages,
                         stream=True,
                         max_tokens=max_tokens,
                         temperature=temperature
                     )

-                    full_content = ""  # accumulates the full content so far
+                    full_content = ""

-                    for idx, response in enumerate(responses):
+                    for response in responses:
                         if response.status_code == 200:
-                            # Check whether the response contains the expected content
-                            # The DashScope API response structure may be output.choices or output.text
                             content = None

-                            # Try to get the content from output.choices
+                            # Extract the content from the multimodal response
                             if (hasattr(response, 'output') and
                                 response.output and
                                 hasattr(response.output, 'choices') and
                                 response.output.choices is not None and
                                 len(response.output.choices) > 0 and
-                                'message' in response.output.choices[0] and
-                                'content' in response.output.choices[0]['message']):
+                                'message' in response.output.choices[0]):

-                                content = response.output.choices[0]['message']['content']
+                                message = response.output.choices[0]['message']
+                                if 'content' in message:
+                                    content_items = message['content']

-                                # Only send a delta when the content has changed
-                                if len(content) > len(full_content):
-                                    delta_content = extract_delta_content(content, full_content)
-                                    full_content = content
+                                    # Extract the text from the content items
+                                    extracted_text = ""
+                                    for item in content_items:
+                                        if isinstance(item, dict) and 'text' in item:
+                                            extracted_text += item['text']

-                                    if delta_content.strip():  # only send when there is non-whitespace new content
-                                        # Build the SSE data chunk
-                                        data = {
-                                            "id": f"chatcmpl-{generate_unique_id()}",
-                                            "object": "chat.completion.chunk",
-                                            "created": get_current_timestamp(),
-                                            "model": model,
-                                            "choices": [
-                                                {
-                                                    "index": 0,
-                                                    "delta": {"content": delta_content},
-                                                    "finish_reason": None
-                                                }
-                                            ]
-                                        }
+                                    content = extracted_text

-                                        yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
-                            # Otherwise try to get the content from output.text (DashScope-specific format)
-                            elif (hasattr(response, 'output') and
-                                  response.output and
-                                  'text' in response.output):
+                                    # Only send a delta when the content has changed
+                                    if len(content) > len(full_content):
+                                        delta_content = extract_delta_content(content, full_content)
+                                        full_content = content

-                                content = response.output.get('text')
+                                        if delta_content.strip():
+                                            data = {
+                                                "id": f"chatcmpl-{generate_unique_id()}",
+                                                "object": "chat.completion.chunk",
+                                                "created": get_current_timestamp(),
+                                                "model": model,
+                                                "choices": [
+                                                    {
+                                                        "index": 0,
+                                                        "delta": {"content": delta_content},
+                                                        "finish_reason": None
+                                                    }
+                                                ]
+                                            }

-                                # Only send a delta when the content has changed
-                                if len(content) > len(full_content):
-                                    delta_content = extract_delta_content(content, full_content)
-                                    full_content = content
-
-                                    if delta_content.strip():  # only send when there is non-whitespace new content
-                                        # Build the SSE data chunk
-                                        data = {
-                                            "id": f"chatcmpl-{generate_unique_id()}",
-                                            "object": "chat.completion.chunk",
-                                            "created": get_current_timestamp(),
-                                            "model": model,
-                                            "choices": [
-                                                {
-                                                    "index": 0,
-                                                    "delta": {"content": delta_content},
-                                                    "finish_reason": None
-                                                }
-                                            ]
-                                        }
-
-                                        yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
+                                            yield f"data: {json.dumps(data, ensure_ascii=False)}\n\n"
                         else:
-                            # Error handling
                             error_data = {
                                 "error": {
-                                    "message": f"API Error: {response.code} - {response.message}",
+                                    "message": f"Multimodal API Error: {response.code} - {response.message}",
                                     "type": "api_error",
                                     "param": None,
                                     "code": response.code
@@ -164,7 +437,6 @@ async def chat_endpoint_handler(body: dict):
                             yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"
                             break
-                    # Send the finish signal
                     finish_data = {
                         "id": f"chatcmpl-{generate_unique_id()}",
                         "object": "chat.completion.chunk",
                         "created": get_current_timestamp(),
@@ -189,68 +461,56 @@ async def chat_endpoint_handler(body: dict):
                    }
                    yield f"data: {json.dumps(error_data, ensure_ascii=False)}\n\n"

-            return StreamingResponse(event_generator(), media_type="text/event-stream")
+            return StreamingResponse(multimodal_event_generator(), media_type="text/event-stream")
         else:
-            # Non-streaming response
-            response = Generation.call(
-                model=model,
-                messages=messages,
+            # Multimodal non-streaming response
+            response = MultiModalConversation.call(
+                model=model.replace('qwen-', 'qwen-vl-') if 'qwen-' in model else 'qwen-vl-max',
+                messages=dashscope_messages,
                 stream=False,
                 max_tokens=max_tokens,
                 temperature=temperature
             )

         if response.status_code == 200:
-            # Check whether the response contains the expected content
-            # The DashScope API response structure may be output.choices or output.text
             content = None

-            # Try to get the content from output.choices
             if (hasattr(response, 'output') and
                 response.output and
                 hasattr(response.output, 'choices') and
                 response.output.choices is not None and
                 len(response.output.choices) > 0 and
-                'message' in response.output.choices[0] and
-                'content' in response.output.choices[0]['message']):
+                'message' in response.output.choices[0]):

-                content = response.output.choices[0]['message']['content']
-            # Otherwise try to get the content from output.text (DashScope-specific format)
-            elif (hasattr(response, 'output') and
-                  response.output and
-                  'text' in response.output):
+                message = response.output.choices[0]['message']
+                if 'content' in message:
+                    content_items = message['content']

-                content = response.output.get('text')
+                    # Extract the text from the content items
+                    extracted_text = ""
+                    for item in content_items:
+                        if isinstance(item, dict) and 'text' in item:
+                            extracted_text += item['text']
+
+                    content = extracted_text

             if content:
-                # Build the response format the frontend expects
-                chat_response = format_api_response(
-                    content=content,
-                    conversation_id=body.get('conversationId'),
-                    model=model
-                )
-
-                if hasattr(response, 'usage') and response.usage:
-                    chat_response["usage"] = {
-                        "promptTokens": response.usage.input_tokens,
-                        "completionTokens": response.usage.output_tokens,
-                        "totalTokens": response.usage.total_tokens
-                    }
-
-                return JSONResponse(content=chat_response, ensure_ascii=False)
+                return JSONResponse(content={"result": content}, ensure_ascii=False)
             else:
                 raise HTTPException(
                     status_code=500,
-                    detail="API Response does not contain expected content"
+                    detail="Multimodal API Response does not contain expected content"
                 )
         else:
             raise HTTPException(
                 status_code=500,
-                detail=f"API Error: {response.code} - {response.message}"
+                detail=f"Multimodal API Error: {response.code} - {response.message}"
             )

     except Exception as e:
-        print(f"[ERROR] Error in chat endpoint: {str(e)}")
+        print(f"[ERROR] Error in multimodal chat handler: {str(e)}")
+        import traceback
+        print(f"[ERROR] Traceback: {traceback.format_exc()}")
         raise HTTPException(status_code=500, detail=str(e))
@@ -277,6 +537,20 @@ async def get_models_handler():
             description="速度更快、成本更低",
             maxTokens=8192,
             provider="Aliyun"
+        ),
+        ModelInfo(
+            id="qwen-vl-max",
+            name="通义万相 VL-Max",
+            description="支持视觉理解的多模态模型",
+            maxTokens=8192,
+            provider="Aliyun"
+        ),
+        ModelInfo(
+            id="qwen-vl-plus",
+            name="通义万相 VL-Plus",
+            description="支持视觉理解的多模态模型",
+            maxTokens=8192,
+            provider="Aliyun"
         )
     ]
     return [model.dict() for model in models]
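
For reference, the sketch below shows roughly what a request that exercises the new multimodal path could look like. It is not part of the patch: the host/port, the /api/chat route, and the uploads/example.jpg file name are assumptions (the route registration is not shown in this diff), and the model/stream fields are assumed to be read from the body like temperature and maxTokens. The messages shape and the two forms of image_url are taken from the handler above.

# Hypothetical client call against the updated endpoint (assumptions noted above).
import requests

payload = {
    "model": "qwen-vl-max",   # one of the vision models this diff adds to the model list
    "stream": False,          # assumed to be read from the body, like temperature/maxTokens
    "temperature": 0.7,
    "maxTokens": 2000,
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                # image_url may be a plain string or a {"url": ...} dict; both are handled above
                {"type": "image_url", "image_url": {"url": "http://localhost:8000/uploads/example.jpg"}}
            ]
        }
    ]
}

resp = requests.post("http://localhost:8000/api/chat", json=payload)
# Per this diff, the non-streaming multimodal branch responds with {"result": "<model reply>"}
print(resp.json())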
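
Both streaming branches rely on extract_delta_content(content, full_content) from the project's utility module, which is not part of this diff. Judging only from how it is called (DashScope streams the cumulative text on each chunk, and only the not-yet-sent suffix should be emitted), a minimal sketch of such a helper might look like this; it is an assumption about its behaviour, not the project's actual implementation:

def extract_delta_content(current: str, already_sent: str) -> str:
    """Return the part of `current` that has not been sent yet.

    Sketch only: the real helper lives in the project's utils module and is not
    shown in this diff. Assumes the API returns cumulative text on every chunk.
    """
    if current.startswith(already_sent):
        # Normal case: the new chunk extends what was already streamed.
        return current[len(already_sent):]
    # Fallback: the cumulative text was rewritten; emit it in full.
    return current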
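
One detail worth flagging: both multimodal branches derive the vision model name with model.replace('qwen-', 'qwen-vl-') and fall back to 'qwen-vl-max'. If the client already selects one of the vision models this diff adds to the model list, that replace yields 'qwen-vl-vl-max'. A small guard along these lines (a sketch, not part of the patch) would keep already-valid names unchanged:

def to_vision_model(model: str) -> str:
    # Sketch of a guard against double-prefixing; not part of the diff above.
    # Assumption: ids that already start with "qwen-vl" can be passed through unchanged.
    if model.startswith('qwen-vl'):
        return model                                # e.g. "qwen-vl-plus" stays as-is
    if 'qwen-' in model:
        return model.replace('qwen-', 'qwen-vl-')   # e.g. "qwen-max" -> "qwen-vl-max"
    return 'qwen-vl-max'                            # same fallback as the diff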