Model Maximum Context Length Error
Hi there, I run an AI chat site (https://www.hammerai.com). I was previously using vLLM serverless, but switched over to using dedicated Pods with the vLLM template (Container Image: vllm/vllm-openai:latest). Here is my configuration:
--host 0.0.0.0 --port 8000 --model LoneStriker/Fimbulvetr-11B-v2-AWQ --enforce-eager --gpu-memory-utilization 0.95 --api-key foo --max-model-len 4096 --max-seq-len-to-capture 4096 --trust-remote-code --chat-template "{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} {% for message in messages %} {% if message['role'] == 'user' %} ### Instruction: {{ message['content']|trim -}} {% if not loop.last %} {% endif %} {% elif message['role'] == 'assistant' %} ### Response: {{ message['content']|trim -}} {% if not loop.last %} {% endif %} {% elif message['role'] == 'user_context' %} ### Input: {{ message['content']|trim -}} {% if not loop.last %} {% endif %} {% endif %} {% endfor %} {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} ### Response: {% endif %}"
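As a sanity check, I can hit the pod's OpenAI-compatible API to confirm the server really came up with a 4096-token window; as far as I can tell, vLLM reports max_model_len on each entry of /v1/models. A minimal sketch (the pod URL is a placeholder, not my real endpoint):

const POD_URL = 'https://<pod-id>-8000.proxy.runpod.net' // placeholder pod URL

const res = await fetch(`${POD_URL}/v1/models`, {
  headers: {Authorization: 'Bearer foo'}, // matches --api-key foo above
})
const {data} = await res.json()
// Each model card should show max_model_len: 4096 if the flag took effect
console.log(data.map((m: {id: string; max_model_len?: number}) => [m.id, m.max_model_len]))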
I then call it with:

import {convertToCoreMessages, streamText} from 'ai' // the vercel ai sdk
import {NextRequest} from 'next/server'

export async function POST(req: NextRequest): Promise<Response> {
  ...
  // Depending on whether it is a chat or a completion, send `messages` or `prompt`:
  const response = await streamText({
    ...(generateChat
      ? {messages: convertToCoreMessages(generateChat.messages)}
      : {prompt: generateCompletion?.prompt}),
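For context on the `...` above: the model handed to streamText is an OpenAI-compatible provider pointed at the pod. A rough sketch of that wiring (placeholder URL, and assuming an AI SDK version whose streamText accepts maxTokens), since my understanding is that prompt tokens plus maxTokens have to fit inside the 4096-token --max-model-len:

import {createOpenAI} from '@ai-sdk/openai'

// Sketch only: the baseURL is a placeholder, not my real RunPod proxy endpoint.
const vllm = createOpenAI({
  baseURL: 'https://<pod-id>-8000.proxy.runpod.net/v1', // placeholder pod URL
  apiKey: 'foo', // matches --api-key foo above
})

const model = vllm('LoneStriker/Fimbulvetr-11B-v2-AWQ')

// Passed into streamText alongside `messages`/`prompt`, e.g.:
//   streamText({model, maxTokens: 512, ...})
// so the prompt plus the generated tokens stay under the 4096-token window.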